Projet-2 Analyse des données ACADEMY
# Imports librairies
import sys
import warnings
import IPython as ip
import matplotlib.pyplot as plt
import missingno as msno
import sklearn as nltk
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
# import mytools as tools
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
import geopandas as gpd
warnings.filterwarnings("ignore")
# Versions
print("Les version des librairies : ")
print("Python : " + sys.version)
print("Numpy : " + np.version.full_version)
print("Pandas : " + pd.__version__)
print("Seaborn : " + sns.__version__)
print("missingno : " + msno.__version__)
print("scikit-learn : " + nltk.__version__)
Les version des librairies : Python : 3.10.7 (tags/v3.10.7:6cc6b13, Sep 5 2022, 14:08:36) [MSC v.1933 64 bit (AMD64)] Numpy : 1.23.4 Pandas : 1.5.1 Seaborn : 0.12.1 missingno : 0.5.1 scikit-learn : 1.1.3
from jyquickhelper import add_notebook_menu
add_notebook_menu()
L'entreprise Academy est une start-up de la EdTech qui propose des contenus de formation pour un public de niveau lycée et universitaire qui a le projet d'étendre ses services à l'internationale.
Problématiques :
Mission : Réaliser une analyse pré-exploratoire de ce jeu de données.
Sources :
Pré-Analyse :
Démarches :
# On crée une fonction info_data pour les autres df (DRY)
def info_data(df):
print(
"------------------------------------------------------------------------------------------------------"
)
print(f"LES DONNES \n")
print("Nombre de lignes :", df.shape[0], "lignes")
print("Nombre de colonnes :", df.shape[1], "colonnes")
# Recherche de doublon
print(
f"Nombre d'éléments dupliqués dans le dataframe : {country.duplicated().sum()} eléments"
)
nb_na = df.isnull().sum().sum()
if nb_na > 0:
nb_data_tt = np.product(df.shape)
pourcent_na_tt = round((nb_na / nb_data_tt) * 100, 2)
print(
f"Nombre total de données manquantes dans le dataframe : {nb_na} données manquantes sur {nb_data_tt} ({pourcent_na_tt}%)"
)
else:
print("Aucune valeur manquante")
# On crée une fonction get_na (DRY)
def info_data_na(df):
nb_na = df.isnull().sum().sum()
nb_data_tt = np.product(df.shape)
pourcent_na_tt = round((nb_na / nb_data_tt) * 100, 2)
print(
f"Nombre total de données manquantes dans le dataframe : {nb_na} données manquantes sur {nb_data_tt} ({pourcent_na_tt}%)"
)
print(
"------------------------------------------------------------------------------------------------------"
)
print("Visualisation des données manquantes")
if nb_na > 0:
plt.figure(figsize=(20, 10))
plt.title("Visualisation des valeurs manquantes")
sns.heatmap(df.isna(), cbar=False)
plt.show()
# Import données
country = pd.read_csv("assets/data/EdStatsCountry.csv", sep=",")
# afficher les 5 premières et les 5 dernières lignes du dataframe
display(country)
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | Currency Unit | Special Notes | Region | Income Group | WB-2 code | ... | IMF data dissemination standard | Latest population census | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest industrial data | Latest trade data | Latest water withdrawal data | Unnamed: 31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABW | Aruba | Aruba | Aruba | AW | Aruban florin | SNA data for 2000-2011 are updated from offici... | Latin America & Caribbean | High income: nonOECD | AW | ... | NaN | 2010 | NaN | NaN | Yes | NaN | NaN | 2012.0 | NaN | NaN |
| 1 | AFG | Afghanistan | Afghanistan | Islamic State of Afghanistan | AF | Afghan afghani | Fiscal year end: March 20; reporting period fo... | South Asia | Low income | AF | ... | General Data Dissemination System (GDDS) | 1979 | Multiple Indicator Cluster Survey (MICS), 2010/11 | Integrated household survey (IHS), 2008 | NaN | 2013/14 | NaN | 2012.0 | 2000 | NaN |
| 2 | AGO | Angola | Angola | People's Republic of Angola | AO | Angolan kwanza | April 2013 database update: Based on IMF data,... | Sub-Saharan Africa | Upper middle income | AO | ... | General Data Dissemination System (GDDS) | 1970 | Malaria Indicator Survey (MIS), 2011 | Integrated household survey (IHS), 2008 | NaN | 2015 | NaN | NaN | 2005 | NaN |
| 3 | ALB | Albania | Albania | Republic of Albania | AL | Albanian lek | NaN | Europe & Central Asia | Upper middle income | AL | ... | General Data Dissemination System (GDDS) | 2011 | Demographic and Health Survey (DHS), 2008/09 | Living Standards Measurement Study Survey (LSM... | Yes | 2012 | 2010.0 | 2012.0 | 2006 | NaN |
| 4 | AND | Andorra | Andorra | Principality of Andorra | AD | Euro | NaN | Europe & Central Asia | High income: nonOECD | AD | ... | NaN | 2011. Population figures compiled from adminis... | NaN | NaN | Yes | NaN | NaN | 2006.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 236 | XKX | Kosovo | Kosovo | Republic of Kosovo | NaN | Euro | Kosovo became a World Bank member on June 29, ... | Europe & Central Asia | Lower middle income | KV | ... | General Data Dissemination System (GDDS) | 2011 | NaN | Integrated household survey (IHS), 2011 | NaN | NaN | NaN | NaN | NaN | NaN |
| 237 | YEM | Yemen | Yemen, Rep. | Republic of Yemen | YE | Yemeni rial | Based on official government statistics and In... | Middle East & North Africa | Lower middle income | RY | ... | General Data Dissemination System (GDDS) | 2004 | Demographic and Health Survey (DHS), 2013 | Expenditure survey/budget survey (ES/BS), 2005 | NaN | NaN | 2006.0 | 2012.0 | 2005 | NaN |
| 238 | ZAF | South Africa | South Africa | Republic of South Africa | ZA | South African rand | Fiscal year end: March 31; reporting period fo... | Sub-Saharan Africa | Upper middle income | ZA | ... | Special Data Dissemination Standard (SDDS) | 2011 | Demographic and Health Survey (DHS), 2003; Wor... | Expenditure survey/budget survey (ES/BS), 2010 | NaN | 2007 | 2010.0 | 2012.0 | 2000 | NaN |
| 239 | ZMB | Zambia | Zambia | Republic of Zambia | ZM | New Zambian kwacha | National accounts data have rebased to reflect... | Sub-Saharan Africa | Lower middle income | ZM | ... | General Data Dissemination System (GDDS) | 2010 | Demographic and Health Survey (DHS), 2013 | Integrated household survey (IHS), 2010 | NaN | 2010. Population and Housing Census. | NaN | 2011.0 | 2002 | NaN |
| 240 | ZWE | Zimbabwe | Zimbabwe | Republic of Zimbabwe | ZW | U.S. dollar | Fiscal year end: June 30; reporting period for... | Sub-Saharan Africa | Low income | ZW | ... | General Data Dissemination System (GDDS) | 2012 | Demographic and Health Survey (DHS), 2010/11 | Integrated household survey (IHS), 2011/12 | NaN | NaN | NaN | 2012.0 | 2002 | NaN |
241 rows × 32 columns
# Résumé des variables qualitatives
country.describe(exclude=[np.number])
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | Currency Unit | Special Notes | Region | Income Group | WB-2 code | ... | External debt Reporting status | System of trade | Government Accounting concept | IMF data dissemination standard | Latest population census | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest water withdrawal data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 241 | 241 | 241 | 241 | 238 | 215 | 145 | 214 | 214 | 240 | ... | 124 | 200 | 161 | 181 | 213 | 141 | 160 | 111 | 142 | 179 |
| unique | 241 | 241 | 241 | 241 | 238 | 152 | 131 | 7 | 5 | 240 | ... | 3 | 2 | 2 | 2 | 27 | 60 | 75 | 2 | 35 | 20 |
| top | ABW | Aruba | Aruba | Aruba | AW | Euro | April 2012 database update: Based on official ... | Europe & Central Asia | Upper middle income | AW | ... | Actual | General trade system | Consolidated central government | General Data Dissemination System (GDDS) | 2011 | World Health Survey (WHS), 2003 | Integrated household survey (IHS), 2012 | Yes | 2010 | 2000 |
| freq | 1 | 1 | 1 | 1 | 1 | 23 | 6 | 57 | 55 | 1 | ... | 107 | 106 | 95 | 110 | 59 | 10 | 15 | 110 | 36 | 40 |
4 rows × 28 columns
# Résumé des variables quantitatives
country.describe()
| National accounts reference year | Latest industrial data | Latest trade data | Unnamed: 31 | |
|---|---|---|---|---|
| count | 32.00000 | 107.000000 | 185.000000 | 0.0 |
| mean | 2001.53125 | 2008.102804 | 2010.994595 | NaN |
| std | 5.24856 | 2.616834 | 2.569675 | NaN |
| min | 1987.00000 | 2000.000000 | 1995.000000 | NaN |
| 25% | 1996.75000 | 2007.500000 | 2011.000000 | NaN |
| 50% | 2002.00000 | 2009.000000 | 2012.000000 | NaN |
| 75% | 2005.00000 | 2010.000000 | 2012.000000 | NaN |
| max | 2012.00000 | 2010.000000 | 2012.000000 | NaN |
# On regarde les types des variables
country.dtypes
Country Code object Short Name object Table Name object Long Name object 2-alpha code object Currency Unit object Special Notes object Region object Income Group object WB-2 code object National accounts base year object National accounts reference year float64 SNA price valuation object Lending category object Other groups object System of National Accounts object Alternative conversion factor object PPP survey year object Balance of Payments Manual in use object External debt Reporting status object System of trade object Government Accounting concept object IMF data dissemination standard object Latest population census object Latest household survey object Source of most recent Income and expenditure data object Vital registration complete object Latest agricultural census object Latest industrial data float64 Latest trade data float64 Latest water withdrawal data object Unnamed: 31 float64 dtype: object
# Premières informations sur les données du dataframe
info_data(country)
------------------------------------------------------------------------------------------------------ LES DONNES Nombre de lignes : 241 lignes Nombre de colonnes : 32 colonnes Nombre d'éléments dupliqués dans le dataframe : 0 eléments Nombre total de données manquantes dans le dataframe : 2354 données manquantes sur 7712 (30.52%)
# Lister les noms de colonne
country.columns
Index(['Country Code', 'Short Name', 'Table Name', 'Long Name', '2-alpha code',
'Currency Unit', 'Special Notes', 'Region', 'Income Group', 'WB-2 code',
'National accounts base year', 'National accounts reference year',
'SNA price valuation', 'Lending category', 'Other groups',
'System of National Accounts', 'Alternative conversion factor',
'PPP survey year', 'Balance of Payments Manual in use',
'External debt Reporting status', 'System of trade',
'Government Accounting concept', 'IMF data dissemination standard',
'Latest population census', 'Latest household survey',
'Source of most recent Income and expenditure data',
'Vital registration complete', 'Latest agricultural census',
'Latest industrial data', 'Latest trade data',
'Latest water withdrawal data', 'Unnamed: 31'],
dtype='object')
# Nombre de valeurs manquantes par variable en %
round(country.isnull().mean() * 100, 2)
Country Code 0.00 Short Name 0.00 Table Name 0.00 Long Name 0.00 2-alpha code 1.24 Currency Unit 10.79 Special Notes 39.83 Region 11.20 Income Group 11.20 WB-2 code 0.41 National accounts base year 14.94 National accounts reference year 86.72 SNA price valuation 18.26 Lending category 40.25 Other groups 75.93 System of National Accounts 10.79 Alternative conversion factor 80.50 PPP survey year 39.83 Balance of Payments Manual in use 24.90 External debt Reporting status 48.55 System of trade 17.01 Government Accounting concept 33.20 IMF data dissemination standard 24.90 Latest population census 11.62 Latest household survey 41.49 Source of most recent Income and expenditure data 33.61 Vital registration complete 53.94 Latest agricultural census 41.08 Latest industrial data 55.60 Latest trade data 23.24 Latest water withdrawal data 25.73 Unnamed: 31 100.00 dtype: float64
# Information et visualisation des données manquantes
info_data_na(country)
Nombre total de données manquantes dans le dataframe : 2354 données manquantes sur 7712 (30.52%) ------------------------------------------------------------------------------------------------------ Visualisation des données manquantes
# Identifier les différentes manières de nommer les pays + Regfion et revenu
country[["Country Code", "Short Name","Table Name","Long Name","2-alpha code","WB-2 code" ,"Currency Unit","Region", "Income Group"]]
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | WB-2 code | Currency Unit | Region | Income Group | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ABW | Aruba | Aruba | Aruba | AW | AW | Aruban florin | Latin America & Caribbean | High income: nonOECD |
| 1 | AFG | Afghanistan | Afghanistan | Islamic State of Afghanistan | AF | AF | Afghan afghani | South Asia | Low income |
| 2 | AGO | Angola | Angola | People's Republic of Angola | AO | AO | Angolan kwanza | Sub-Saharan Africa | Upper middle income |
| 3 | ALB | Albania | Albania | Republic of Albania | AL | AL | Albanian lek | Europe & Central Asia | Upper middle income |
| 4 | AND | Andorra | Andorra | Principality of Andorra | AD | AD | Euro | Europe & Central Asia | High income: nonOECD |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 236 | XKX | Kosovo | Kosovo | Republic of Kosovo | NaN | KV | Euro | Europe & Central Asia | Lower middle income |
| 237 | YEM | Yemen | Yemen, Rep. | Republic of Yemen | YE | RY | Yemeni rial | Middle East & North Africa | Lower middle income |
| 238 | ZAF | South Africa | South Africa | Republic of South Africa | ZA | ZA | South African rand | Sub-Saharan Africa | Upper middle income |
| 239 | ZMB | Zambia | Zambia | Republic of Zambia | ZM | ZM | New Zambian kwacha | Sub-Saharan Africa | Lower middle income |
| 240 | ZWE | Zimbabwe | Zimbabwe | Republic of Zimbabwe | ZW | ZW | U.S. dollar | Sub-Saharan Africa | Low income |
241 rows × 9 columns
# Lister les regions
print("Régions :\n", country["Region"].unique())
Régions : ['Latin America & Caribbean' 'South Asia' 'Sub-Saharan Africa' 'Europe & Central Asia' nan 'Middle East & North Africa' 'East Asia & Pacific' 'North America']
print(country["Region"].nunique(), "regions")
7 regions
# compter et Lister les pays
print("Pays :\n", country["Short Name"].nunique(), "pays")
print(
"Le fichier présente 241 données géographiques comprennant des pays et dses zones géographiques"
)
Pays : 241 pays Le fichier présente 241 données géographiques comprennant des pays et dses zones géographiques
# Dénominations pays
country[
[
"Country Code",
"Short Name",
"Table Name",
"Long Name",
"2-alpha code",
"WB-2 code",
]
]
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | WB-2 code | |
|---|---|---|---|---|---|---|
| 0 | ABW | Aruba | Aruba | Aruba | AW | AW |
| 1 | AFG | Afghanistan | Afghanistan | Islamic State of Afghanistan | AF | AF |
| 2 | AGO | Angola | Angola | People's Republic of Angola | AO | AO |
| 3 | ALB | Albania | Albania | Republic of Albania | AL | AL |
| 4 | AND | Andorra | Andorra | Principality of Andorra | AD | AD |
| ... | ... | ... | ... | ... | ... | ... |
| 236 | XKX | Kosovo | Kosovo | Republic of Kosovo | NaN | KV |
| 237 | YEM | Yemen | Yemen, Rep. | Republic of Yemen | YE | RY |
| 238 | ZAF | South Africa | South Africa | Republic of South Africa | ZA | ZA |
| 239 | ZMB | Zambia | Zambia | Republic of Zambia | ZM | ZM |
| 240 | ZWE | Zimbabwe | Zimbabwe | Republic of Zimbabwe | ZW | ZW |
241 rows × 6 columns
# https://data.worldbank.org/country
# On liste les zones contenu dans le df
areas = [
"Arab World",
"East Asia & Pacific",
"East Asia & Pacific (excluding high income)",
"East Asia & Pacific (developing only)",
"East Asia & Pacific (all income levels)",
"Euro area",
"European Union",
"Europe & Centraµl Asia",
"Europe & Central Asia (excluding high income)",
"Europe & Central Asia (developing only)",
"Europe & Central Asia (all income levels)",
"European Union",
"Heavily indebted poor countries (HIPC)",
"High income",
"Latin America & Caribbean",
"Latin America & Caribbean (developing only)",
"Latin America & Caribbean (excluding high income)",
"Least developed countries: UN classification",
"Low & middle income",
"Low income",
"Lower middle income",
"Middle East & North Africa",
"Middle East & North Africa (excluding high income)",
"Middle East & North Africa (all income levels)",
"Middle East & North Africa (developing only)",
"Middle income",
"North America",
"OECD members",
"South Asia",
"Sub-Saharan Africa",
"Sub-Saharan Africa (excluding high income)",
"Upper middle income",
"Sub-Saharan Africa (developing only)",
"Sub-Saharan Africa (all income levels)",
"World",
]
# Compter le nombre de pays qui ne sont pas des zones géographiques
country[~country["Short Name"].isin(areas)]["Short Name"].unique().shape
(217,)
# Créer la liste des pays présent dans country
list_countries = (
country[~country["Short Name"].isin(areas)]["Short Name"].unique().tolist()
)
print(list_countries)
['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia', 'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh', 'Bulgaria', 'Bahrain', 'The Bahamas', 'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda', 'Bolivia', 'Brazil', 'Barbados', 'Brunei', 'Bhutan', 'Botswana', 'Central African Republic', 'Canada', 'Switzerland', 'Channel Islands', 'Chile', 'China', "Côte d'Ivoire", 'Cameroon', 'Dem. Rep. Congo', 'Congo', 'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica', 'Cuba', 'Curaçao', 'Cayman Islands', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica', 'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia', 'Finland', 'Fiji', 'France', 'Faeroe Islands', 'Micronesia', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana', 'Gibraltar', 'Guinea', 'The Gambia', 'Guinea-Bissau', 'Equatorial Guinea', 'Greece', 'Grenada', 'Greenland', 'Guatemala', 'Guam', 'Guyana', 'Hong Kong SAR, China', 'Honduras', 'Croatia', 'Haiti', 'Hungary', 'Indonesia', 'Isle of Man', 'India', 'Ireland', 'Iran', 'Iraq', 'Iceland', 'Israel', 'Italy', 'Jamaica', 'Jordan', 'Japan', 'Kazakhstan', 'Kenya', 'Kyrgyz Republic', 'Cambodia', 'Kiribati', 'St. Kitts and Nevis', 'Korea', 'Kuwait', 'Lao PDR', 'Lebanon', 'Liberia', 'Libya', 'St. Lucia', 'Latin America & Caribbean (all income levels)', 'Liechtenstein', 'Sri Lanka', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Macao SAR, China', 'St. Martin (French part)', 'Morocco', 'Monaco', 'Moldova', 'Madagascar', 'Maldives', 'Mexico', 'Marshall Islands', 'Macedonia', 'Mali', 'Malta', 'Myanmar', 'Montenegro', 'Mongolia', 'Northern Mariana Islands', 'Mozambique', 'Mauritania', 'Mauritius', 'Malawi', 'Malaysia', 'Namibia', 'New Caledonia', 'Niger', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal', 'Nauru', 'New Zealand', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Palau', 'Papua New Guinea', 'Poland', 'Puerto Rico', "Dem. People's Rep. Korea", 'Portugal', 'Paraguay', 'West Bank and Gaza', 'French Polynesia', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Saudi Arabia', 'Sudan', 'Senegal', 'Singapore', 'Solomon Islands', 'Sierra Leone', 'El Salvador', 'San Marino', 'Somalia', 'Serbia', 'South Sudan', 'São Tomé and Principe', 'Suriname', 'Slovak Republic', 'Slovenia', 'Sweden', 'Swaziland', 'Sint Maarten (Dutch part)', 'Seychelles', 'Syrian Arab Republic', 'Turks and Caicos Islands', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Turkmenistan', 'Timor-Leste', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Tuvalu', 'Tanzania', 'Uganda', 'Ukraine', 'Uruguay', 'United States', 'Uzbekistan', 'St. Vincent and the Grenadines', 'Venezuela', 'Virgin Islands', 'Vietnam', 'Vanuatu', 'Samoa', 'Kosovo', 'Yemen', 'South Africa', 'Zambia', 'Zimbabwe']
# Compter le nombre d'éléments dans la liste pour avoir le nombre de pays
len(list_countries)
217
print(country["Short Name"].unique())
['Aruba' 'Afghanistan' 'Angola' 'Albania' 'Andorra' 'Arab World' 'United Arab Emirates' 'Argentina' 'Armenia' 'American Samoa' 'Antigua and Barbuda' 'Australia' 'Austria' 'Azerbaijan' 'Burundi' 'Belgium' 'Benin' 'Burkina Faso' 'Bangladesh' 'Bulgaria' 'Bahrain' 'The Bahamas' 'Bosnia and Herzegovina' 'Belarus' 'Belize' 'Bermuda' 'Bolivia' 'Brazil' 'Barbados' 'Brunei' 'Bhutan' 'Botswana' 'Central African Republic' 'Canada' 'Switzerland' 'Channel Islands' 'Chile' 'China' "Côte d'Ivoire" 'Cameroon' 'Dem. Rep. Congo' 'Congo' 'Colombia' 'Comoros' 'Cabo Verde' 'Costa Rica' 'Cuba' 'Curaçao' 'Cayman Islands' 'Cyprus' 'Czech Republic' 'Germany' 'Djibouti' 'Dominica' 'Denmark' 'Dominican Republic' 'Algeria' 'East Asia & Pacific (developing only)' 'East Asia & Pacific (all income levels)' 'Europe & Central Asia (developing only)' 'Europe & Central Asia (all income levels)' 'Ecuador' 'Egypt' 'Euro area' 'Eritrea' 'Spain' 'Estonia' 'Ethiopia' 'European Union' 'Finland' 'Fiji' 'France' 'Faeroe Islands' 'Micronesia' 'Gabon' 'United Kingdom' 'Georgia' 'Ghana' 'Gibraltar' 'Guinea' 'The Gambia' 'Guinea-Bissau' 'Equatorial Guinea' 'Greece' 'Grenada' 'Greenland' 'Guatemala' 'Guam' 'Guyana' 'High income' 'Hong Kong SAR, China' 'Honduras' 'Heavily indebted poor countries (HIPC)' 'Croatia' 'Haiti' 'Hungary' 'Indonesia' 'Isle of Man' 'India' 'Ireland' 'Iran' 'Iraq' 'Iceland' 'Israel' 'Italy' 'Jamaica' 'Jordan' 'Japan' 'Kazakhstan' 'Kenya' 'Kyrgyz Republic' 'Cambodia' 'Kiribati' 'St. Kitts and Nevis' 'Korea' 'Kuwait' 'Latin America & Caribbean (developing only)' 'Lao PDR' 'Lebanon' 'Liberia' 'Libya' 'St. Lucia' 'Latin America & Caribbean (all income levels)' 'Least developed countries: UN classification' 'Low income' 'Liechtenstein' 'Sri Lanka' 'Lower middle income' 'Low & middle income' 'Lesotho' 'Lithuania' 'Luxembourg' 'Latvia' 'Macao SAR, China' 'St. Martin (French part)' 'Morocco' 'Monaco' 'Moldova' 'Madagascar' 'Maldives' 'Middle East & North Africa (all income levels)' 'Mexico' 'Marshall Islands' 'Middle income' 'Macedonia' 'Mali' 'Malta' 'Myanmar' 'Middle East & North Africa (developing only)' 'Montenegro' 'Mongolia' 'Northern Mariana Islands' 'Mozambique' 'Mauritania' 'Mauritius' 'Malawi' 'Malaysia' 'North America' 'Namibia' 'New Caledonia' 'Niger' 'Nigeria' 'Nicaragua' 'Netherlands' 'Norway' 'Nepal' 'Nauru' 'New Zealand' 'OECD members' 'Oman' 'Pakistan' 'Panama' 'Peru' 'Philippines' 'Palau' 'Papua New Guinea' 'Poland' 'Puerto Rico' "Dem. People's Rep. Korea" 'Portugal' 'Paraguay' 'West Bank and Gaza' 'French Polynesia' 'Qatar' 'Romania' 'Russia' 'Rwanda' 'South Asia' 'Saudi Arabia' 'Sudan' 'Senegal' 'Singapore' 'Solomon Islands' 'Sierra Leone' 'El Salvador' 'San Marino' 'Somalia' 'Serbia' 'Sub-Saharan Africa (developing only)' 'South Sudan' 'Sub-Saharan Africa (all income levels)' 'São Tomé and Principe' 'Suriname' 'Slovak Republic' 'Slovenia' 'Sweden' 'Swaziland' 'Sint Maarten (Dutch part)' 'Seychelles' 'Syrian Arab Republic' 'Turks and Caicos Islands' 'Chad' 'Togo' 'Thailand' 'Tajikistan' 'Turkmenistan' 'Timor-Leste' 'Tonga' 'Trinidad and Tobago' 'Tunisia' 'Turkey' 'Tuvalu' 'Tanzania' 'Uganda' 'Ukraine' 'Upper middle income' 'Uruguay' 'United States' 'Uzbekistan' 'St. Vincent and the Grenadines' 'Venezuela' 'Virgin Islands' 'Vietnam' 'Vanuatu' 'World' 'Samoa' 'Kosovo' 'Yemen' 'South Africa' 'Zambia' 'Zimbabwe']
# On recherche le nombre de pays par région
nb_pays_par_region = (
country.groupby("Region")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name", ascending=False)
)
nb_pays_par_region
| Region | Short Name | |
|---|---|---|
| 1 | Europe & Central Asia | 57 |
| 6 | Sub-Saharan Africa | 48 |
| 2 | Latin America & Caribbean | 41 |
| 0 | East Asia & Pacific | 36 |
| 3 | Middle East & North Africa | 21 |
| 5 | South Asia | 8 |
| 4 | North America | 3 |
# On visualise le nombre de pays par région
df = (
country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True)
)
val = df.tolist()
plt.figure(figsize=(4, 4))
plt.title("Nombre de pays par région", fontname="Arial", fontsize=18, fontweight="bold")
plt.barh(
y=country.groupby("Region")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name")["Region"],
width=country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True),
)
# plt.grid(axis='x')
# On place les nombre sur le graphe 1 par 1
plt.text(val[6] - 5.5, 5.8, val[6], color="white", fontweight="bold")
plt.text(val[5] - 5.5, 4.8, val[5], color="white", fontweight="bold")
plt.text(val[4] - 5.5, 3.8, val[4], color="white", fontweight="bold")
plt.text(val[3] - 5.5, 2.8, val[3], color="white", fontweight="bold")
plt.text(val[2] - 5.5, 1.8, val[2], color="white", fontweight="bold")
plt.text(val[1] - 3, 0.8, val[1], color="white", fontweight="bold")
plt.text(val[0] - 2.5, -0.2, val[0], color="white", fontweight="bold")
plt.savefig('assets/graphiques/EDA_1_Nombre de pays par région.jpg', bbox_inches='tight')
plt.show()
# On liste les indicateurs de revenus
country["Income Group"].unique()
array(['High income: nonOECD', 'Low income', 'Upper middle income', nan,
'Lower middle income', 'High income: OECD'], dtype=object)
# On compte les types de revenu
country["Income Group"].nunique()
5
# On compte le nombre de nan dans la colonne Income Group
print("Il y a",country["Income Group"].isnull().sum(),"données manquantes dans Income Group qui correspondent aux zones ou groupes géographiques")
Il y a 27 données manquantes dans Income Group qui correspondent aux zones ou groupes géographiques
# Afficher les na de la colonne Income Group
country.loc[country["Income Group"].isna() == True, :]
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | Currency Unit | Special Notes | Region | Income Group | WB-2 code | ... | IMF data dissemination standard | Latest population census | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest industrial data | Latest trade data | Latest water withdrawal data | Unnamed: 31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | ARB | Arab World | Arab World | Arab World | 1A | NaN | Arab World aggregate. Arab World is composed o... | NaN | NaN | 1A | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 57 | EAP | East Asia & Pacific (developing only) | East Asia & Pacific | East Asia & Pacific (developing only) | 4E | NaN | East Asia and Pacific regional aggregate (does... | NaN | NaN | 4E | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 58 | EAS | East Asia & Pacific (all income levels) | East Asia & Pacific (all income levels) | East Asia & Pacific (all income levels) | Z4 | NaN | East Asia and Pacific regional aggregate (incl... | NaN | NaN | Z4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 59 | ECA | Europe & Central Asia (developing only) | Europe & Central Asia | Europe & Central Asia (developing only) | 7E | NaN | Europe and Central Asia regional aggregate (do... | NaN | NaN | 7E | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60 | ECS | Europe & Central Asia (all income levels) | Europe & Central Asia (all income levels) | Europe & Central Asia (all income levels) | Z7 | NaN | Europe and Central Asia regional aggregate (in... | NaN | NaN | Z7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 63 | EMU | Euro area | Euro area | Euro area | XC | NaN | Euro area aggregate. | NaN | NaN | XC | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 68 | EUU | European Union | European Union | European Union | EU | NaN | European Union aggregate. | NaN | NaN | EU | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 78 | GIB | Gibraltar | Gibraltar | Gibraltar | GI | Gibraltar pound | NaN | NaN | NaN | GI | ... | NaN | 2012 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 89 | HIC | High income | High income | High income | XD | NaN | High income group aggregate. High-income econo... | NaN | NaN | XD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 92 | HPC | Heavily indebted poor countries (HIPC) | Heavily indebted poor countries (HIPC) | Heavily indebted poor countries (HIPC) | XE | NaN | Heavily indebted poor countries aggregate. | NaN | NaN | XE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 116 | LAC | Latin America & Caribbean (developing only) | Latin America & Caribbean | Latin America & Caribbean (developing only) | XJ | NaN | Latin America and Caribbean regional aggregate... | NaN | NaN | XJ | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 122 | LCN | Latin America & Caribbean (all income levels) | Latin America & Caribbean (all income levels) | Latin America & Caribbean (all income levels) | ZJ | NaN | Latin America and Caribbean regional aggregate... | NaN | NaN | ZJ | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 123 | LDC | Least developed countries: UN classification | Least developed countries: UN classification | Least developed countries: UN classification | XL | NaN | Least developed countries (UN classification) ... | NaN | NaN | XL | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 124 | LIC | Low income | Low income | Low income | XM | NaN | Low income group aggregate. Low-income economi... | NaN | NaN | XM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 127 | LMC | Lower middle income | Lower middle income | Lower middle income | XN | NaN | Lower middle income group aggregate. Lower-mid... | NaN | NaN | XN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 128 | LMY | Low & middle income | Low & middle income | Low & middle income | XO | NaN | Low and middle income group aggregate (all dev... | NaN | NaN | XO | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 140 | MEA | Middle East & North Africa (all income levels) | Middle East & North Africa (all income levels) | Middle East & North Africa (all income levels) | ZQ | NaN | Middle East and North Africa regional aggregat... | NaN | NaN | ZQ | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 143 | MIC | Middle income | Middle income | Middle income | XP | NaN | Middle income group aggregate. Middle-income e... | NaN | NaN | XP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 148 | MNA | Middle East & North Africa (developing only) | Middle East & North Africa | Middle East & North Africa (developing only) | XQ | NaN | Middle East and North Africa regional aggregat... | NaN | NaN | XQ | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 157 | NAC | North America | North America | North America | XU | NaN | North America regional aggregate. There are no... | NaN | NaN | XU | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 166 | NRU | Nauru | Nauru | Nauru | NR | NaN | NaN | NaN | NaN | NR | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 168 | OED | OECD members | OECD members | OECD members | OE | NaN | OECD members aggregate (all, including develop... | NaN | NaN | OE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 187 | SAS | South Asia | South Asia | South Asia | 8S | NaN | South Asia regional aggregate. There are no ec... | NaN | NaN | 8S | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 198 | SSA | Sub-Saharan Africa (developing only) | Sub-Saharan Africa | Sub-Saharan Africa (developing only) | ZF | NaN | Sub-Saharan Africa regional aggregate (does no... | NaN | NaN | ZF | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 200 | SSF | Sub-Saharan Africa (all income levels) | Sub-Saharan Africa (all income levels) | Sub-Saharan Africa (all income levels) | ZG | NaN | Sub-Saharan Africa regional aggregate (includi... | NaN | NaN | ZG | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 225 | UMC | Upper middle income | Upper middle income | Upper middle income | XT | NaN | Upper middle income group aggregate. Upper-mid... | NaN | NaN | XT | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 234 | WLD | World | World | World | 1W | NaN | World aggregate. | NaN | NaN | 1W | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
27 rows × 32 columns
# On calcule le nombre de pays par région
nb_pays_par_groupe_revenu = (
country.groupby("Income Group")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name", ascending=False)
)
nb_pays_par_groupe_revenu
| Income Group | Short Name | |
|---|---|---|
| 4 | Upper middle income | 55 |
| 3 | Lower middle income | 50 |
| 1 | High income: nonOECD | 44 |
| 2 | Low income | 34 |
| 0 | High income: OECD | 31 |
# On calcule le nombre de pays par groupe de revenu
nb_pays_par_groupe_revenu = nb_pays_par_groupe_revenu.sort_values(
by="Short Name", ascending=False
)
nb_pays_par_groupe_revenu
| Income Group | Short Name | |
|---|---|---|
| 4 | Upper middle income | 55 |
| 3 | Lower middle income | 50 |
| 1 | High income: nonOECD | 44 |
| 2 | Low income | 34 |
| 0 | High income: OECD | 31 |
# On visualise le nombre de pays par groupes de revenus
plt.figure(figsize=(15, 7))
plt.barh(
y=nb_pays_par_groupe_revenu["Income Group"],
width=country.groupby("Income Group")[["Short Name"]]
.count()["Short Name"]
.sort_values(),
)
plt.title(
"Nombre de pays par groupe de revenu",
fontname="Arial",
fontsize=18,
fontweight="bold",
)
# On place les chiffres sur le graphe avec un boucle
for i in range(5):
plt.text(
82 - (nb_pays_par_groupe_revenu.iloc[i, 1]),
i - 0.1,
nb_pays_par_groupe_revenu.iloc[4 - i, 1],
color="white",
fontweight="bold",
fontsize=20,
)
plt.savefig('assets/graphiques/EDA_2_Nombre de pays par groupe de revenu.jpg', bbox_inches='tight')
plt.show()
# On calcule le nombre de pays par groupe de monnaie = monnaie ma plus utilisée
nb_pays_par_groupe_monnaie = (
country.groupby("Currency Unit")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name", ascending=False)
)
nb_pays_par_groupe_monnaie.head(6)
| Currency Unit | Short Name | |
|---|---|---|
| 46 | Euro | 23 |
| 143 | U.S. dollar | 14 |
| 23 | CFA franc | 14 |
| 42 | East Caribbean dollar | 6 |
| 114 | Pound sterling | 3 |
| 38 | Danish krone | 3 |
# On visualise les 8 monnaies les plus utilisée
plt.figure(figsize=(8, 4))
plt.barh(
y=country.groupby("Currency Unit")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name")["Currency Unit"]
.tail(8),
width=country.groupby("Currency Unit")[["Short Name"]]
.count()["Short Name"]
.sort_values()
.tail(8),
)
plt.title(
"Devises les plus utilisées", fontname="Arial", fontsize=14, fontweight="bold"
)
plt.text(
21.5,
6.74,
nb_pays_par_groupe_monnaie.iloc[0, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
12.5,
5.74,
nb_pays_par_groupe_monnaie.iloc[1, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
12.5,
4.74,
nb_pays_par_groupe_monnaie.iloc[2, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
5,
3.74,
nb_pays_par_groupe_monnaie.iloc[3, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
2.2,
2.78,
nb_pays_par_groupe_monnaie.iloc[4, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
2.2,
1.78,
nb_pays_par_groupe_monnaie.iloc[5, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
2.2,
0.78,
nb_pays_par_groupe_monnaie.iloc[6, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.text(
1.2,
-0.24,
nb_pays_par_groupe_monnaie.iloc[7, 1],
color="white",
fontweight="bold",
fontsize=16,
)
plt.savefig('assets/graphiques/EDA_3_Devises les plus utilisées.jpg', bbox_inches='tight')
plt.show()
# Regroupement des pays par groupe de revenus
sns.countplot(
y="Income Group",
hue="Region",
data=country,
orient="h",
order=[
"High income: OECD",
"High income: nonOECD",
"Upper middle income",
"Lower middle income",
"Low income",
],
palette=sns.color_palette('bright'),
)
plt.xlabel("Nombre de pays", fontsize=12)
plt.ylabel("Groupe de revenus", fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=0, borderaxespad=0.0, fontsize=12)
plt.title(
"Nombre de pays par groupe de revenus par région", fontsize=14, fontweight="bold"
)
plt.grid(False)
plt.savefig('assets/graphiques/EDA_3_Nombre de pays par groupe de revenu et par régions.jpg', bbox_inches='tight')
plt.show()
# Visualisation Regroupements des pays/régions/groupes de revenus pour avoir un visuel unique
fig = plt.figure(figsize=(15, 10))
grid = plt.GridSpec(2, 2, wspace=0.8, hspace=0.3)
# Graphique ligne 1, colonne 1
plt.subplot(grid[0, 0])
# Nombre de pays par région
df = (
country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True)
)
val = df.tolist()
plt.title("Nombre de pays par région", fontname="Arial", fontsize=14, fontweight="bold")
plt.barh(
y=country.groupby("Region")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name")["Region"],
width=country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True),
)
# plt.grid(axis='x')
plt.text(val[6] - 5.5, 5.9, val[6], color="white", fontweight="bold")
plt.text(val[5] - 5.5, 4.9, val[5], color="white", fontweight="bold")
plt.text(val[4] - 5.5, 3.9, val[4], color="white", fontweight="bold")
plt.text(val[3] - 5.5, 2.9, val[3], color="white", fontweight="bold")
plt.text(val[2] - 5.5, 1.9, val[2], color="white", fontweight="bold")
plt.text(val[1] - 3.5, 0.9, val[1], color="white", fontweight="bold")
plt.text(val[0] - 2.5, -0.1, val[0], color="white", fontweight="bold")
# Graphique ligne 1, colonne 2
plt.subplot(grid[0, 1])
# Nombre de pays par groupe de revenus
plt.barh(
y=nb_pays_par_groupe_revenu["Income Group"],
width=country.groupby("Income Group")[["Short Name"]]
.count()["Short Name"]
.sort_values(),
)
plt.title("Nombre de pays par groupe de revenu", fontweight="bold")
for i in range(5):
plt.text(
80 - (nb_pays_par_groupe_revenu.iloc[i, 1]),
i - 0.1,
nb_pays_par_groupe_revenu.iloc[4 - i, 1],
color="white",
fontweight="bold",
fontsize=14,
)
# Graphique ligne 2, sur les 2 colonne
plt.subplot(grid[1, :])
# Regroupement des pays par groupe de revenus
sns.countplot(
y="Income Group",
hue="Region",
data=country,
orient="h",
order=[
"High income: OECD",
"High income: nonOECD",
"Upper middle income",
"Lower middle income",
"Low income",
],
palette=sns.color_palette("bright"),
)
plt.xlabel("Nombre de pays", fontsize=12)
plt.ylabel("Groupe de revenus", fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=12)
plt.title(
"Nombre de pays par groupe de revenus par région", fontsize=14, fontweight="bold"
)
plt.grid(False)
plt.savefig('assets/graphiques/EDA_4_Synthèse 3graph.jpg', bbox_inches='tight')
plt.show()
# Visualisation 1 visuel 4 graphiques
plt.figure(figsize=(15, 15), dpi=100) # taille
sns.set(style="dark")
plt.subplot(4, 2, 1)
plt.title("Distribution - analyse univariée")
df = (
country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True)
)
val = df.tolist()
# Graphique 1 : Nombre de pays par Région
plt.title("Nombre de pays par région", fontname="Arial", fontweight="bold", fontsize=14)
plt.barh(
y=country.groupby("Region")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name")["Region"],
width=country.groupby("Region")[["Short Name"]]
.count()["Short Name"]
.sort_values(ascending=True),
)
# plt.grid(axis='x')
plt.text(val[6] - 5.5, 5.9, val[6], color="white", fontweight="bold")
plt.text(val[5] - 5.5, 4.9, val[5], color="white", fontweight="bold")
plt.text(val[4] - 5.5, 3.9, val[4], color="white", fontweight="bold")
plt.text(val[3] - 5.5, 2.9, val[3], color="white", fontweight="bold")
plt.text(val[2] - 5.5, 1.9, val[2], color="white", fontweight="bold")
plt.text(val[1] - 3.5, 0.9, val[1], color="white", fontweight="bold")
plt.text(val[0] - 2.5, -0.1, val[0], color="white", fontweight="bold")
# Nombre de pays par groupe de revenu
plt.subplot(4, 2, 2)
plt.barh(
y=nb_pays_par_groupe_revenu["Income Group"],
width=country.groupby("Income Group")[["Short Name"]]
.count()["Short Name"]
.sort_values(),
)
plt.title(
"Nombre de pays par groupe de revenu",
fontname="Arial",
fontweight="bold",
fontsize=14,
)
# On place les chiffres sur le graphe avec un boucle
for i in range(5):
plt.text(
80 - (nb_pays_par_groupe_revenu.iloc[i, 1]),
i - 0.1,
nb_pays_par_groupe_revenu.iloc[4 - i, 1],
color="white",
fontweight="bold",
)
# Monnaies les plus utilisées
plt.subplot(4, 2, 3)
plt.barh(
y=country.groupby("Currency Unit")[["Short Name"]]
.count()
.reset_index()
.sort_values(by="Short Name")["Currency Unit"]
.tail(8),
width=country.groupby("Currency Unit")[["Short Name"]]
.count()["Short Name"]
.sort_values()
.tail(8),
)
plt.title(
"Monnaies les plus utilisées", fontname="Arial", fontweight="bold", fontsize=14
)
plt.text(
21.5, 6.74, nb_pays_par_groupe_monnaie.iloc[0, 1], color="white", fontweight="bold"
)
plt.text(
12.5, 5.74, nb_pays_par_groupe_monnaie.iloc[1, 1], color="white", fontweight="bold"
)
plt.text(
12.5, 4.74, nb_pays_par_groupe_monnaie.iloc[2, 1], color="white", fontweight="bold"
)
plt.text(
5, 3.74, nb_pays_par_groupe_monnaie.iloc[3, 1], color="white", fontweight="bold"
)
plt.text(
2.2, 2.78, nb_pays_par_groupe_monnaie.iloc[4, 1], color="white", fontweight="bold"
)
plt.text(
2.2, 1.78, nb_pays_par_groupe_monnaie.iloc[5, 1], color="white", fontweight="bold"
)
plt.text(
2.2, 0.78, nb_pays_par_groupe_monnaie.iloc[6, 1], color="white", fontweight="bold"
)
plt.text(
1.2, -0.24, nb_pays_par_groupe_monnaie.iloc[7, 1], color="white", fontweight="bold"
)
# Regroupement des pays par groupe de revenus et par région
plt.subplot(4, 2, 4)
sns.countplot(
y="Income Group",
hue="Region",
data=country,
orient="h",
order=[
"High income: OECD",
"High income: nonOECD",
"Upper middle income",
"Lower middle income",
"Low income",
],
palette=sns.color_palette("bright"),
)
plt.xlabel("Nombre de pays", fontsize=12)
plt.ylabel("Groupe de revenus", fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=12)
plt.title(
"Nombre de pays par groupe de revenus par région",
fontname="Arial",
fontweight="bold",
fontsize=14,
)
plt.grid(False)
plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=1, hspace=0.35)
# bbox_inches='tight'
plt.savefig('assets/graphiques/EDA_5_Synthèse 4graph.jpg', bbox_inches='tight')
plt.show()
**Bilan de : `EdStatsCountry.csv`** :
EdStatsCountry.csv contient des informations globales sur l'économie : niveau de vie, monnaie..., de chaque pays du monde ainsi qu'un classement par région de chaque pays.
Les informations concernent aussi des groupes de pays par région ou niveau de revenu (income)
Information complémentaire sur les données
# Import données
country_Series = pd.read_csv("assets/data/EdStatsCountry-Series.csv", sep=",")
# On affiche les 5 premières lignes
country_Series.head()
| CountryCode | SeriesCode | DESCRIPTION | Unnamed: 3 | |
|---|---|---|---|---|
| 0 | ABW | SP.POP.TOTL | Data sources : United Nations World Population... | NaN |
| 1 | ABW | SP.POP.GROW | Data sources: United Nations World Population ... | NaN |
| 2 | AFG | SP.POP.GROW | Data sources: United Nations World Population ... | NaN |
| 3 | AFG | NY.GDP.PCAP.PP.CD | Estimates are based on regression. | NaN |
| 4 | AFG | SP.POP.TOTL | Data sources : United Nations World Population... | NaN |
# Premières informations : nom colonnes, nb null, types
country_Series.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 613 entries, 0 to 612 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CountryCode 613 non-null object 1 SeriesCode 613 non-null object 2 DESCRIPTION 613 non-null object 3 Unnamed: 3 0 non-null float64 dtypes: float64(1), object(3) memory usage: 19.3+ KB
# Description des variables qualitatives (pas de variable quantitatives dans ce dataframe)
country_Series.describe(exclude=[np.number])
| CountryCode | SeriesCode | DESCRIPTION | |
|---|---|---|---|
| count | 613 | 613 | 613 |
| unique | 211 | 21 | 97 |
| top | MDA | SP.POP.TOTL | Data sources : United Nations World Population... |
| freq | 18 | 211 | 154 |
# Utilisation de la fonction def
info_data(country_Series)
------------------------------------------------------------------------------------------------------ LES DONNES Nombre de lignes : 613 lignes Nombre de colonnes : 4 colonnes Nombre d'éléments dupliqués dans le dataframe : 0 eléments Nombre total de données manquantes dans le dataframe : 613 données manquantes sur 2452 (25.0%)
info_data_na(country_Series)
Nombre total de données manquantes dans le dataframe : 613 données manquantes sur 2452 (25.0%) ------------------------------------------------------------------------------------------------------ Visualisation des données manquantes
country_Series["CountryCode"].nunique()
211
# On affiche les pays présents dans CountrySeries mais absent de Country
country[~country['Country Code'].isin(country_Series['CountryCode'].unique())][['Short Name']]
| Short Name | |
|---|---|
| 5 | Arab World |
| 47 | Curaçao |
| 57 | East Asia & Pacific (developing only) |
| 58 | East Asia & Pacific (all income levels) |
| 59 | Europe & Central Asia (developing only) |
| 60 | Europe & Central Asia (all income levels) |
| 63 | Euro area |
| 68 | European Union |
| 89 | High income |
| 92 | Heavily indebted poor countries (HIPC) |
| 116 | Latin America & Caribbean (developing only) |
| 122 | Latin America & Caribbean (all income levels) |
| 123 | Least developed countries: UN classification |
| 124 | Low income |
| 127 | Lower middle income |
| 128 | Low & middle income |
| 134 | St. Martin (French part) |
| 140 | Middle East & North Africa (all income levels) |
| 143 | Middle income |
| 148 | Middle East & North Africa (developing only) |
| 157 | North America |
| 166 | Nauru |
| 168 | OECD members |
| 187 | South Asia |
| 198 | Sub-Saharan Africa (developing only) |
| 199 | South Sudan |
| 200 | Sub-Saharan Africa (all income levels) |
| 207 | Sint Maarten (Dutch part) |
| 225 | Upper middle income |
| 234 | World |
Ce sont des zones, des sudivisions de pays et des données relatrives aux revenus
country_Series["SeriesCode"].nunique()
21
country_Series["SeriesCode"].unique()
array(['SP.POP.TOTL', 'SP.POP.GROW', 'NY.GDP.PCAP.PP.CD',
'NY.GDP.MKTP.PP.KD', 'NY.GNP.MKTP.PP.CD', 'NY.GDP.MKTP.PP.CD',
'NY.GDP.PCAP.PP.KD', 'NY.GNP.PCAP.PP.CD', 'SP.POP.1564.TO.ZS',
'SP.POP.TOTL.MA.ZS', 'SP.POP.TOTL.FE.ZS', 'SP.POP.0014.TO.ZS',
'NY.GNP.PCAP.CD', 'NY.GDP.PCAP.CD', 'NY.GDP.PCAP.KD',
'SP.POP.1564.MA.IN', 'SP.POP.0014.TO', 'SP.POP.1564.TO',
'SP.POP.1564.FE.IN', 'SP.POP.0014.MA.IN', 'SP.POP.0014.FE.IN'],
dtype=object)
country_Series["DESCRIPTION"].unique()
array(['Data sources : United Nations World Population Prospects',
'Data sources: United Nations World Population Prospects',
'Estimates are based on regression.',
'Data sources : Institute of Statistics, Eurostat',
'Data sources : Australian Bureau of Statistics',
'Including Other Territories comprising Jervis Bay Territory, Christmas Island and the Cocos (Keeling) Islands. ',
'Data sources: Australian Bureau of Statistics',
'Data sources : Eurostat',
'Data sources : Eurostat, State Statistical Committee, United Nations World Population Prospects',
'Data sources: Eurostat, State Statistical Committee, United Nations World Population Prospects',
'Data sources : National Statistical Committee of the Republic of Belarus',
'Data sources: National Statistical Committee of the Republic of Belarus',
'Data sources : Bermuda Government Department of Statistics',
'Data sources: Bermuda Government Department of Statistics',
'Data sources : Statistics Canada',
'Data sources: Statistics Canada',
'Refers to Guernsey, and Jersey.',
'Data sources : National Bureau of Statistics',
'Excluding Hong Kong SAR, Macao SAR, Taiwan.',
'Data sources: National Bureau of Statistics',
'Data are for areas under the effective control of the Government of the Republic of Cyprus.',
'Including Northern-Cyprus.',
'Data sources: Eurostat, United Nations World Population Prospects. Including the French overseas departments of French Guiana, Guadeloupe, Martinique, Mayotte, and Réunion.',
'Including the four French overseas departments (French Guyana, Guadeloupe, Martinique and Réunion)',
'Data sources : Eurostat, United Nations World Population Prospects. Including the French overseas departments of French Guiana, Guadeloupe, Martinique, Mayotte, and Réunion.',
'Data sources: Eurostat',
'Data sources : GeoStats. GeoStats. Adjusted to mid-year population using geometric mean for the years where mid-year population figures are not available. Excluding Abkhazia and South Ossetia.',
'Including Abkhazia and South Ossetia.',
'Excludes Abkhazia and South Ossetia.',
'Derived using ratio of age group from WPP and World Bank population.',
'Data sources: GeoStats. GeoStats. Adjusted to mid-year population using geometric mean for the years where mid-year population figures are not available. Excluding Abkhazia and South Ossetia.',
'Excluding Abkhazia and South Ossetia.',
'Data sources: Statistics Greenland',
'Data sources : Statistics Greenland',
'Data sources: Census and Statistics Department',
'Data sources : Census and Statistics Department',
'Data sources: Croatian Bureau of Statistics, Council of Europe',
'Data sources : Croatian Bureau of Statistics, Council of Europe',
'Data sources: Central Bureau of Statistics',
'Data sources : Central Bureau of Statistics',
'Data sources: Ministry of Internal Affairs and Communications, Statistics Bureau; United Nations Population and Vital Statistics Report',
'Data sources : Ministry of Internal Affairs and Communications, Statistics Bureau; United Nations Population and Vital Statistics Report',
'Data sources : Statistical Agency of Kazakhstan, 1979 Census, 1989 Census, United Nations World Population Prospects',
'Data sources: Statistical Agency of Kazakhstan, 1979 Census, 1989 Census, United Nations World Population Prospects',
'Data sources : National Statistical Committee, United Nations World Population Prospects',
'Data sources: National Statistical Committee, United Nations World Population Prospects',
'Data sources : Korean Statistical Information Service',
'Data sources: Korean Statistical Information Service',
"Data sources : Registrar General's Department, Department of Census and Statistics",
"Data sources: Registrar General's Department, Department of Census and Statistics",
'Includes Former Spanish Sahara.',
'Morocco only. (Excluding former Spanish Sahara)',
"Data sources : National Bureau of Statistics. Excluding Transnistria. For 1950-94, World Bank estimates using UN World Population Prospects' growth rates of whole Moldova.",
'Excludes Transnistria.', 'Including Transnistria.',
"Data sources: National Bureau of Statistics. Excluding Transnistria. For 1950-94, World Bank estimates using UN World Population Prospects 2010 Revision's growth rates of whole Moldova.",
'Excluding Transnistria.',
'Data sources : For 1960-1989, United Nations World Population Prospects. For 1990 onwards, 1990, 1995, 2000, 2006, 2014 censuses, and data on expatriate employment from Maldives National Bureau of Statistics. The census data are adjusted for mid-year and for foreign population.',
'Data sources: For 1960-1989, United Nations World Population Prospects. For 1990 onwards, 1990, 1995, 2000, 2006, 2014 censuses, and data on expatriate employment from Maldives National Bureau of Statistics. The census data are adjusted for mid-year and for foreign population.',
'Data sources : For years after 2000, Statistical Office of Montenegro. For years 1960-1999, growth rates of United Nations World Population Prospects were used to estimate backwards.',
'Data sources : Statistics Mauritius, United Nations World Population Prospects',
'Including Agalega and St. Brandon.',
'Data sources: Central Statistics Office, United Nations Population and Vital Statistics Report, United Nations World Population Prospects',
'Excluding Agalega and St. Brandon.',
'Including Sabah and Sarawak.',
'Data sources : Institute of Statistics and Economic Studies',
'Data sources: Institute of Statistics and Economic Studies, 1983 Census, 1989 Census, 1996 Census',
'Data sources: Statistics New Zealand',
'Data sources : Statistics New Zealand',
'Data sources : U.S. Census Bureau, 1990 Census',
'Data sources: U.S. Census Bureau, 1990 Census',
'Data sources : World Bank estimates based on data from Palestinian Central Bureau of Statistics, excluding E. Jerusalem.',
'Excluding East Jerusalem.',
'Data sources: World Bank estimates based on data from Palestinian Central Bureau of Statistics, excluding E. Jerusalem.',
'Data sources : Russian Federation Federal State Statistics Service (its January 1st populations were adjusted to mid-year populations by using geometric mean), 1979 Census, 1989 Census.',
'Data sources : United Nations World Population Prospects. Estimates are for Sudan excluding South Sudan.',
'Data sources: Department of Statistics Singapore',
'Data sources : Department of Statistics Singapore',
'Data sources: Statistical Office of the Republic of Serbia, Eurostat, TransMONEE, Council of Europe. Excluding Kosovo for all years 1950-2011',
'Including Kosovo.',
'Data sources : Statistical Office of the Republic of Serbia, Eurostat, TransMONEE, Council of Europe. Note : Excluding Kosovo.',
'Excluding Kosovo.', 'Covers mainland Tanzania only.',
'Including Zanzibar.',
'Data sources : State Statistics Committee of Ukraine, United Nations World Population Prospects, Council of Europe, United Nations Population and Vital Statistics Report',
'Data sources: State Statistics Committee of Ukraine, United Nations World Population Prospects, Council of Europe, United Nations Population and Vital Statistics Report',
'Data sources : U.S. Census Bureau',
'Data sources: U.S. Census Bureau',
'Data sources : State Committee of the Republic of Uzbekistan on Statistics, United Nations World Population Prospects',
'Based on official exchange rates.',
'Data sources : U.S. Census Bureau, United Nations Population and Vital Statistics Report',
'Data sources: U.S. Census Bureau, United Nations Population and Vital Statistics Report',
'Data sources: General Statistics Office of Vietnam',
'Data sources : General Statistics Office of Vietnam',
"Data sources : World Bank's Kosovo coutnry office, Kosovo Agency of Statistics, U.S. Census Bureau",
"Data sources: World Bank's Kosovo coutnry office, Kosovo Agency of Statistics, U.S. Census Bureau",
'Data sources : Statistics South Africa, United Nations World Population Prospects'],
dtype=object)
Series_code = country_Series.groupby('SeriesCode')[['CountryCode']].count().reset_index().sort_values(by='CountryCode')['SeriesCode']
nb_pays_serie_code=country_Series.groupby('SeriesCode')[['CountryCode']].count()['CountryCode'].sort_values(ascending=True)
# Graphique : Nombre de pays par indicateurs
plt.barh(Series_code,nb_pays_serie_code)
plt.title("Repartition du Nombre de pays par indicateur",weight='bold')
for i, var in enumerate(nb_pays_serie_code):
plt.text(var+1,i-0.5,str(var),color='steelblue',fontweight='bold')
plt.savefig('assets/graphiques/EDA_6_Répartition pays par indicateurs.jpg', bbox_inches='tight')
# Graphique : Répartition des 10 sources de données les plus utilisées
description = country_Series.groupby('DESCRIPTION')[['CountryCode']].count().reset_index().sort_values(by='CountryCode')['DESCRIPTION'].tail(10)
nb_pays_description = country_Series.groupby('DESCRIPTION')[['CountryCode']].count()['CountryCode'].sort_values().tail(10)
plt.barh(description,nb_pays_description)
plt.title("Repartition du Nombre de données par nombre de pays par sources",weight='bold')
for i, var in enumerate(nb_pays_description):
plt.text(var+1,i-0.2,str(var),color='steelblue',fontweight='bold')
plt.savefig('assets/graphiques/EDA_7_Répartition source pays.jpg', bbox_inches='tight')
# Création d'un visuel unique concernant les indicateurs et les descriptions
fig, ax = plt.subplots(1, 2, figsize=(20, 7))
# Regroupement des pays par code indicateur
ax1=ax[0]
ax1.barh(y = Series_code, width=nb_pays_serie_code, color='SteelBlue')
for i, v in enumerate(nb_pays_serie_code):
ax1.text(v + 3, i - .22, str(v), color='SteelBlue', fontsize=10, fontweight='bold')
ax1.set_title('Nombre de pays par code indicateur', fontsize=12,fontweight='bold')
ax1.set_xlabel('Nombre de pays', fontsize=10)
ax1.set_ylabel('Codes indicateurs', fontsize=10)
ax1.tick_params(labelsize=10)
ax1.set_xlim([0, 250])
# ax1.grid(True)
# Graphique 2 : Top 10 descriptions (sources)
ax2=ax[1]
ax2.barh(y = description, width=nb_pays_description, color='SteelBlue')
for i, v in enumerate(nb_pays_description):
ax2.text(v + 3, i + .01, str(v), color='SteelBlue', fontsize=10, fontweight='bold')
ax2.set_title('Nombre de pays pour les tops 10 descriptions', fontsize=12,fontweight='bold')
ax2.set_xlabel('Nombre de pays', fontsize=10)
ax2.set_ylabel('Sources des codes indicateurs', fontsize=10)
ax2.tick_params(labelsize=10)
ax2.set_xlim([0, 180])
# ax2.grid(True)
plt.savefig('assets/graphiques/EDA_8_Synthèse 2graph répartition indicatreur et sources.jpg', bbox_inches='tight')
fig.tight_layout()
plt.show()
**Bilan de : `EdStatsCountry-Series.csv`** :
Le dataset nous fournit les descriptions des indicateurs contenus dans EdStatsData.csv
# Import données
footNote = pd.read_csv("assets/data/EdStatsFootNote.csv", sep=",")
footNote.head()
| CountryCode | SeriesCode | Year | DESCRIPTION | Unnamed: 4 | |
|---|---|---|---|---|---|
| 0 | ABW | SE.PRE.ENRL.FE | YR2001 | Country estimation. | NaN |
| 1 | ABW | SE.TER.TCHR.FE | YR2005 | Country estimation. | NaN |
| 2 | ABW | SE.PRE.TCHR.FE | YR2000 | Country estimation. | NaN |
| 3 | ABW | SE.SEC.ENRL.GC | YR2004 | Country estimation. | NaN |
| 4 | ABW | SE.PRE.TCHR | YR2006 | Country estimation. | NaN |
footNote.columns
Index(['CountryCode', 'SeriesCode', 'Year', 'DESCRIPTION', 'Unnamed: 4'], dtype='object')
info_data(footNote)
------------------------------------------------------------------------------------------------------ LES DONNES Nombre de lignes : 643638 lignes Nombre de colonnes : 5 colonnes Nombre d'éléments dupliqués dans le dataframe : 0 eléments Nombre total de données manquantes dans le dataframe : 643638 données manquantes sur 3218190 (20.0%)
footNote.dtypes
CountryCode object SeriesCode object Year object DESCRIPTION object Unnamed: 4 float64 dtype: object
# Résumé des variables qualitatives
country.describe(exclude=[np.number])
| Country Code | Short Name | Table Name | Long Name | 2-alpha code | Currency Unit | Special Notes | Region | Income Group | WB-2 code | ... | External debt Reporting status | System of trade | Government Accounting concept | IMF data dissemination standard | Latest population census | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest water withdrawal data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 241 | 241 | 241 | 241 | 238 | 215 | 145 | 214 | 214 | 240 | ... | 124 | 200 | 161 | 181 | 213 | 141 | 160 | 111 | 142 | 179 |
| unique | 241 | 241 | 241 | 241 | 238 | 152 | 131 | 7 | 5 | 240 | ... | 3 | 2 | 2 | 2 | 27 | 60 | 75 | 2 | 35 | 20 |
| top | ABW | Aruba | Aruba | Aruba | AW | Euro | April 2012 database update: Based on official ... | Europe & Central Asia | Upper middle income | AW | ... | Actual | General trade system | Consolidated central government | General Data Dissemination System (GDDS) | 2011 | World Health Survey (WHS), 2003 | Integrated household survey (IHS), 2012 | Yes | 2010 | 2000 |
| freq | 1 | 1 | 1 | 1 | 1 | 23 | 6 | 57 | 55 | 1 | ... | 107 | 106 | 95 | 110 | 59 | 10 | 15 | 110 | 36 | 40 |
4 rows × 28 columns
info_data_na(footNote)
Nombre total de données manquantes dans le dataframe : 643638 données manquantes sur 3218190 (20.0%) ------------------------------------------------------------------------------------------------------ Visualisation des données manquantes
# Liste des noms des pays présents dans Country et qui ne sont pas présents dans le jeu de données EdStatsCountry-Series.csv
country[~country["Country Code"].isin(country_Series["CountryCode"])][
["Short Name"]
].sort_values(by="Short Name")
| Short Name | |
|---|---|
| 5 | Arab World |
| 47 | Curaçao |
| 58 | East Asia & Pacific (all income levels) |
| 57 | East Asia & Pacific (developing only) |
| 63 | Euro area |
| 60 | Europe & Central Asia (all income levels) |
| 59 | Europe & Central Asia (developing only) |
| 68 | European Union |
| 92 | Heavily indebted poor countries (HIPC) |
| 89 | High income |
| 122 | Latin America & Caribbean (all income levels) |
| 116 | Latin America & Caribbean (developing only) |
| 123 | Least developed countries: UN classification |
| 128 | Low & middle income |
| 124 | Low income |
| 127 | Lower middle income |
| 140 | Middle East & North Africa (all income levels) |
| 148 | Middle East & North Africa (developing only) |
| 143 | Middle income |
| 166 | Nauru |
| 157 | North America |
| 168 | OECD members |
| 207 | Sint Maarten (Dutch part) |
| 187 | South Asia |
| 199 | South Sudan |
| 134 | St. Martin (French part) |
| 200 | Sub-Saharan Africa (all income levels) |
| 198 | Sub-Saharan Africa (developing only) |
| 225 | Upper middle income |
| 234 | World |
list_description = footNote['DESCRIPTION'].unique().tolist()
footnote donne des Informations sur année d'origine des données et la description des données. La colonne Year n'est pas utilisable, il faut la traiter :
# On affiche les années
footNote['Year'].unique()
array(['YR2001', 'YR2005', 'YR2000', 'YR2004', 'YR2006', 'YR2003',
'YR1999', 'YR2008', 'YR2002', 'YR2007', 'YR2017', 'YR2010',
'YR2009', 'YR2011', 'YR2012', 'YR2015', 'YR2014', 'YR1970',
'YR1971', 'YR1972', 'YR1973', 'YR1974', 'YR1975', 'YR1976',
'YR1977', 'YR1978', 'YR1979', 'YR1980', 'YR1981', 'YR1982',
'YR1983', 'YR1984', 'YR1985', 'YR1986', 'YR1987', 'YR1988',
'YR1989', 'YR1990', 'YR1991', 'YR1992', 'YR1993', 'YR1994',
'YR1995', 'YR1996', 'YR1997', 'YR1998', 'YR2013', 'YR2016',
'YR2020', 'YR2025', 'YR2030', 'YR2035', 'YR2040', 'YR2045',
'YR2050', 'yr2012'], dtype=object)
# On supprime les 2 premières lettres pour avoir uniquement l'année
footNote['Year']=footNote['Year'].apply(lambda x : x[2:])
footNote['Year'].unique()
array(['2001', '2005', '2000', '2004', '2006', '2003', '1999', '2008',
'2002', '2007', '2017', '2010', '2009', '2011', '2012', '2015',
'2014', '1970', '1971', '1972', '1973', '1974', '1975', '1976',
'1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984',
'1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
'1993', '1994', '1995', '1996', '1997', '1998', '2013', '2016',
'2020', '2025', '2030', '2035', '2040', '2045', '2050'],
dtype=object)
# On affiche les données en fonction des années
footNote.groupby('Year').count()
| CountryCode | SeriesCode | DESCRIPTION | Unnamed: 4 | |
|---|---|---|---|---|
| Year | ||||
| 1970 | 5374 | 5374 | 5374 | 0 |
| 1971 | 8507 | 8507 | 8507 | 0 |
| 1972 | 8661 | 8661 | 8661 | 0 |
| 1973 | 8571 | 8571 | 8571 | 0 |
| 1974 | 8512 | 8512 | 8512 | 0 |
| 1975 | 8627 | 8627 | 8627 | 0 |
| 1976 | 8729 | 8729 | 8729 | 0 |
| 1977 | 8593 | 8593 | 8593 | 0 |
| 1978 | 8585 | 8585 | 8585 | 0 |
| 1979 | 8742 | 8742 | 8742 | 0 |
| 1980 | 8950 | 8950 | 8950 | 0 |
| 1981 | 9465 | 9465 | 9465 | 0 |
| 1982 | 9014 | 9014 | 9014 | 0 |
| 1983 | 9227 | 9227 | 9227 | 0 |
| 1984 | 9386 | 9386 | 9386 | 0 |
| 1985 | 9653 | 9653 | 9653 | 0 |
| 1986 | 9743 | 9743 | 9743 | 0 |
| 1987 | 9357 | 9357 | 9357 | 0 |
| 1988 | 9382 | 9382 | 9382 | 0 |
| 1989 | 9517 | 9517 | 9517 | 0 |
| 1990 | 10616 | 10616 | 10616 | 0 |
| 1991 | 11129 | 11129 | 11129 | 0 |
| 1992 | 11309 | 11309 | 11309 | 0 |
| 1993 | 11093 | 11093 | 11093 | 0 |
| 1994 | 11448 | 11448 | 11448 | 0 |
| 1995 | 11194 | 11194 | 11194 | 0 |
| 1996 | 11197 | 11197 | 11197 | 0 |
| 1997 | 11167 | 11167 | 11167 | 0 |
| 1998 | 14916 | 14916 | 14916 | 0 |
| 1999 | 24938 | 24938 | 24938 | 0 |
| 2000 | 25093 | 25093 | 25093 | 0 |
| 2001 | 25047 | 25047 | 25047 | 0 |
| 2002 | 25687 | 25687 | 25687 | 0 |
| 2003 | 25683 | 25683 | 25683 | 0 |
| 2004 | 27128 | 27128 | 27128 | 0 |
| 2005 | 25992 | 25992 | 25992 | 0 |
| 2006 | 23394 | 23394 | 23394 | 0 |
| 2007 | 24263 | 24263 | 24263 | 0 |
| 2008 | 25018 | 25018 | 25018 | 0 |
| 2009 | 22965 | 22965 | 22965 | 0 |
| 2010 | 22598 | 22598 | 22598 | 0 |
| 2011 | 17760 | 17760 | 17760 | 0 |
| 2012 | 13904 | 13904 | 13904 | 0 |
| 2013 | 13405 | 13405 | 13405 | 0 |
| 2014 | 11378 | 11378 | 11378 | 0 |
| 2015 | 6348 | 6348 | 6348 | 0 |
| 2016 | 819 | 819 | 819 | 0 |
| 2017 | 558 | 558 | 558 | 0 |
| 2020 | 140 | 140 | 140 | 0 |
| 2025 | 140 | 140 | 140 | 0 |
| 2030 | 140 | 140 | 140 | 0 |
| 2035 | 144 | 144 | 144 | 0 |
| 2040 | 144 | 144 | 144 | 0 |
| 2045 | 144 | 144 | 144 | 0 |
| 2050 | 144 | 144 | 144 | 0 |
footNote['Year'].sort_values().unique()
array(['1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
'1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
'1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
'1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
'2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
'2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
'2020', '2025', '2030', '2035', '2040', '2045', '2050'],
dtype=object)
# Visualiser la répartion du nombre d'indicateurs par pays et année
plt.figure(figsize=(15,5))
plt.title("Nombres d'indicateurs par pays et années",fontweight='bold')
sns.barplot(data=footNote.groupby('Year').count().reset_index(),y='CountryCode',x='Year')
plt.xticks(rotation=90)
plt.savefig("assets/graphiques/EDA_9_Nombres d'indicateurs par pays et années.jpg", bbox_inches='tight')
plt.show()
**Bilan de : `EdStatsFootNote.csv`** :
# Import données
serie = pd.read_csv("assets/data/EdStatsSeries.csv", sep=",")
serie.head()
| Series Code | Topic | Indicator Name | Short definition | Long definition | Unit of measure | Periodicity | Base Period | Other notes | Aggregation method | ... | Notes from original source | General comments | Source | Statistical concept and methodology | Development relevance | Related source links | Other web links | Related indicators | License Type | Unnamed: 20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BAR.NOED.1519.FE.ZS | Attainment | Barro-Lee: Percentage of female population age... | Percentage of female population age 15-19 with... | Percentage of female population age 15-19 with... | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | Robert J. Barro and Jong-Wha Lee: http://www.b... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | BAR.NOED.1519.ZS | Attainment | Barro-Lee: Percentage of population age 15-19 ... | Percentage of population age 15-19 with no edu... | Percentage of population age 15-19 with no edu... | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | Robert J. Barro and Jong-Wha Lee: http://www.b... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | BAR.NOED.15UP.FE.ZS | Attainment | Barro-Lee: Percentage of female population age... | Percentage of female population age 15+ with n... | Percentage of female population age 15+ with n... | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | Robert J. Barro and Jong-Wha Lee: http://www.b... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | BAR.NOED.15UP.ZS | Attainment | Barro-Lee: Percentage of population age 15+ wi... | Percentage of population age 15+ with no educa... | Percentage of population age 15+ with no educa... | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | Robert J. Barro and Jong-Wha Lee: http://www.b... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | BAR.NOED.2024.FE.ZS | Attainment | Barro-Lee: Percentage of female population age... | Percentage of female population age 20-24 with... | Percentage of female population age 20-24 with... | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | Robert J. Barro and Jong-Wha Lee: http://www.b... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 21 columns
info_data(serie)
------------------------------------------------------------------------------------------------------ LES DONNES Nombre de lignes : 3665 lignes Nombre de colonnes : 21 colonnes Nombre d'éléments dupliqués dans le dataframe : 0 eléments Nombre total de données manquantes dans le dataframe : 55203 données manquantes sur 76965 (71.72%)
serie.dtypes
Series Code object Topic object Indicator Name object Short definition object Long definition object Unit of measure float64 Periodicity object Base Period object Other notes object Aggregation method object Limitations and exceptions object Notes from original source float64 General comments object Source object Statistical concept and methodology object Development relevance object Related source links object Other web links float64 Related indicators float64 License Type float64 Unnamed: 20 float64 dtype: object
info_data_na(serie)
Nombre total de données manquantes dans le dataframe : 55203 données manquantes sur 76965 (71.72%) ------------------------------------------------------------------------------------------------------ Visualisation des données manquantes
# On affiche le pourcentage de données manquantes par colonne classée dans l'ordre croissant
na_serie = serie.isnull().mean()*100
na_serie.sort_values()
Series Code 0.000000 Topic 0.000000 Indicator Name 0.000000 Long definition 0.000000 Source 0.000000 Short definition 41.173261 Other notes 84.938608 Base Period 91.432469 Related source links 94.133697 Periodicity 97.298772 Aggregation method 98.717599 Statistical concept and methodology 99.372442 Limitations and exceptions 99.618008 General comments 99.618008 Development relevance 99.918145 Notes from original source 100.000000 License Type 100.000000 Unit of measure 100.000000 Other web links 100.000000 Related indicators 100.000000 Unnamed: 20 100.000000 dtype: float64
serie['Series Code'].head()
0 BAR.NOED.1519.FE.ZS 1 BAR.NOED.1519.ZS 2 BAR.NOED.15UP.FE.ZS 3 BAR.NOED.15UP.ZS 4 BAR.NOED.2024.FE.ZS Name: Series Code, dtype: object
serie['Series Code'].nunique()
3665
# Afficher 5 dernieres valeurs
serie['Topic'].tail()
3660 Expenditures 3661 Expenditures 3662 Expenditures 3663 Expenditures 3664 Expenditures Name: Topic, dtype: object
# Lister les items
list_item = serie['Topic'].unique()
# Visualiser les 20 premiers items
list_item[:20]
array(['Attainment', 'Education Equality',
'Infrastructure: Communications', 'Learning Outcomes',
'Economic Policy & Debt: National accounts: US$ at current prices: Aggregate indicators',
'Economic Policy & Debt: National accounts: US$ at constant 2010 prices: Aggregate indicators',
'Economic Policy & Debt: Purchasing power parity',
'Economic Policy & Debt: National accounts: Atlas GNI & GNI per capita',
'Teachers', 'Education Management Information Systems (SABER)',
'Early Child Development (SABER)',
'Engaging the Private Sector (SABER)',
'School Health and School Feeding (SABER)',
'School Autonomy and Accountability (SABER)',
'School Finance (SABER)', 'Student Assessment (SABER)',
'Teachers (SABER)', 'Tertiary Education (SABER)',
'Workforce Development (SABER)', 'Literacy'], dtype=object)
serie['Topic'].nunique()
37
serie['Indicator Name'].nunique()
3665
list_indicator = serie['Indicator Name'].unique().tolist()
list_indicator[350:400]
['Barro-Lee: Average years of tertiary schooling, age 55-59, total', 'Barro-Lee: Average years of tertiary schooling, age 55-59, female', 'Barro-Lee: Average years of tertiary schooling, age 60-64, total', 'Barro-Lee: Average years of tertiary schooling, age 60-64, female', 'Barro-Lee: Average years of tertiary schooling, age 65-69, total', 'Barro-Lee: Average years of tertiary schooling, age 65-69, female', 'Barro-Lee: Average years of tertiary schooling, age 70-74, total', 'Barro-Lee: Average years of tertiary schooling, age 70-74, female', 'Barro-Lee: Average years of tertiary schooling, age 75+, total', 'Barro-Lee: Average years of tertiary schooling, age 75+, female', 'DHS: Gross attendance rate. Post Secondary', 'DHS: Gross attendance rate. Post Secondary. Female', 'DHS: Gross attendance rate. Post Secondary. Male', 'DHS: Gross attendance rate. Post Secondary. Quintile 1', 'DHS: Gross attendance rate. Post Secondary. Quintile 2', 'DHS: Gross attendance rate. Post Secondary. Quintile 3', 'DHS: Gross attendance rate. Post Secondary. Quintile 4', 'DHS: Gross attendance rate. Post Secondary. Quintile 5', 'DHS: Gross attendance rate. Post Secondary. Rural', 'DHS: Gross attendance rate. Post Secondary. Urban', 'DHS: Net attendance rate. Primary', 'DHS: Net attendance rate. Primary. Female', 'DHS: Net attendance rate. Primary. Male', 'DHS: Net attendance rate. Primary. Quintile 1', 'DHS: Net attendance rate. Primary. Quintile 2', 'DHS: Net attendance rate. Primary. Quintile 3', 'DHS: Net attendance rate. Primary. Quintile 4', 'DHS: Net attendance rate. Primary. Quintile 5', 'DHS: Net attendance rate. Primary. Rural', 'DHS: Net attendance rate. Primary. Urban', 'DHS: Net attendance rate. Secondary', 'DHS: Net attendance rate. Secondary. Female', 'DHS: Net attendance rate. Secondary. Male', 'DHS: Net attendance rate. Secondary. Quintile 1', 'DHS: Net attendance rate. Secondary. Quintile 2', 'DHS: Net attendance rate. Secondary. Quintile 3', 'DHS: Net attendance rate. Secondary. Quintile 4', 'DHS: Net attendance rate. Secondary. Quintile 5', 'DHS: Net attendance rate. Secondary. Rural', 'DHS: Net attendance rate. Secondary. Urban', 'DHS: Net intake rate for the first grade of primary education', 'DHS: Net intake rate for the first grade of primary education. Female', 'DHS: Net intake rate for the first grade of primary education. Male', 'DHS: Net intake rate for the first grade of primary education. Quintile 1', 'DHS: Net intake rate for the first grade of primary education. Quintile 2', 'DHS: Net intake rate for the first grade of primary education. Quintile 3', 'DHS: Net intake rate for the first grade of primary education. Quintile 4', 'DHS: Net intake rate for the first grade of primary education. Quintile 5', 'DHS: Net intake rate for the first grade of primary education. Rural', 'DHS: Net intake rate for the first grade of primary education. Urban']
# Nombre de variable dans serie
serie['Long definition'].count()
3665
# On affiche les définitions longues
serie['Long definition'].head(2)
0 Percentage of female population age 15-19 with... 1 Percentage of population age 15-19 with no edu... Name: Long definition, dtype: object
# On affiche les définitions longues
serie['Short definition'].count()
2156
# Nombres d'indicateurs pour les 10 items les plus renseignés
items = serie.groupby('Topic')['Series Code'].count().reset_index().sort_values(by="Series Code",ascending=False)
items
| Topic | Series Code | |
|---|---|---|
| 19 | Learning Outcomes | 1046 |
| 0 | Attainment | 733 |
| 9 | Education Equality | 426 |
| 28 | Secondary | 256 |
| 24 | Primary | 248 |
| 21 | Population | 213 |
| 34 | Tertiary | 158 |
| 32 | Teachers | 137 |
| 12 | Expenditures | 93 |
| 11 | Engaging the Private Sector (SABER) | 51 |
| 20 | Literacy | 32 |
| 33 | Teachers (SABER) | 29 |
| 25 | School Autonomy and Accountability (SABER) | 29 |
| 4 | Early Childhood Education | 24 |
| 10 | Education Management Information Systems (SABER) | 23 |
| 22 | Post-Secondary/Non-Tertiary | 19 |
| 26 | School Finance (SABER) | 18 |
| 35 | Tertiary Education (SABER) | 17 |
| 31 | Student Assessment (SABER) | 15 |
| 23 | Pre-Primary | 14 |
| 15 | Health: Population: Structure | 13 |
| 36 | Workforce Development (SABER) | 12 |
| 3 | Early Child Development (SABER) | 12 |
| 29 | Social Protection & Labor: Labor force structure | 11 |
| 27 | School Health and School Feeding (SABER) | 9 |
| 8 | Economic Policy & Debt: Purchasing power parity | 6 |
| 30 | Social Protection & Labor: Unemployment | 3 |
| 7 | Economic Policy & Debt: National accounts: US$... | 3 |
| 18 | Laber | 3 |
| 1 | Background | 2 |
| 17 | Infrastructure: Communications | 2 |
| 6 | Economic Policy & Debt: National accounts: US$... | 2 |
| 2 | EMIS | 2 |
| 16 | Health: Risk factors | 1 |
| 14 | Health: Population: Dynamics | 1 |
| 13 | Health: Mortality | 1 |
| 5 | Economic Policy & Debt: National accounts: Atl... | 1 |
**Bilan de `EdStatsSeries.csv`** :
# Import données
data = pd.read_csv("assets/data/EdStatsData.csv", sep=",")
data.head()
| Country Name | Country Code | Indicator Name | Indicator Code | 1970 | 1971 | 1972 | 1973 | 1974 | 1975 | ... | 2060 | 2065 | 2070 | 2075 | 2080 | 2085 | 2090 | 2095 | 2100 | Unnamed: 69 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arab World | ARB | Adjusted net enrolment rate, lower secondary, ... | UIS.NERA.2 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | Arab World | ARB | Adjusted net enrolment rate, lower secondary, ... | UIS.NERA.2.F | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Arab World | ARB | Adjusted net enrolment rate, lower secondary, ... | UIS.NERA.2.GPI | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | Arab World | ARB | Adjusted net enrolment rate, lower secondary, ... | UIS.NERA.2.M | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | Arab World | ARB | Adjusted net enrolment rate, primary, both sex... | SE.PRM.TENR | 54.822121 | 54.894138 | 56.209438 | 57.267109 | 57.991138 | 59.36554 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 70 columns
# Nombre de type de variable
values = data.dtypes.value_counts()
nb_tt = values.sum()
percent = round(values / nb_tt*100,2)
# print(values)
# print(nb_tt)
tab_type = pd.concat([values, percent], axis=1)
tab_type.columns = ['Nombre par type de variable'
, '% des types de variable'] # création des colonnes
display(tab_type[tab_type['Nombre par type de variable'] != 0]
.sort_values('% des types de variable', ascending = False)# trier les valeurs par ordre decroissant
.style.background_gradient("Blues")) # choix couleurs radial
| Nombre par type de variable | % des types de variable | |
|---|---|---|
| float64 | 66 | 94.290000 |
| object | 4 | 5.710000 |
plt.pie(x = data.dtypes.value_counts(),
labels = data.dtypes.value_counts().index,
)
plt.title('Répartion des types de variables',fontweight='bold')
plt.show()
# Résumé des variables qualitatives
data.describe(exclude=[np.number])
| Country Name | Country Code | Indicator Name | Indicator Code | |
|---|---|---|---|---|
| count | 886930 | 886930 | 886930 | 886930 |
| unique | 242 | 242 | 3665 | 3665 |
| top | Arab World | ARB | Adjusted net enrolment rate, lower secondary, ... | UIS.NERA.2 |
| freq | 3665 | 3665 | 242 | 242 |
# Résumé des variables quantitatives
data.describe()
| 1970 | 1971 | 1972 | 1973 | 1974 | 1975 | 1976 | 1977 | 1978 | 1979 | ... | 2060 | 2065 | 2070 | 2075 | 2080 | 2085 | 2090 | 2095 | 2100 | Unnamed: 69 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.228800e+04 | 3.553700e+04 | 3.561900e+04 | 3.554500e+04 | 3.573000e+04 | 8.730600e+04 | 3.748300e+04 | 3.757400e+04 | 3.757600e+04 | 3.680900e+04 | ... | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 5.143600e+04 | 0.0 |
| mean | 1.974772e+09 | 4.253638e+09 | 4.592365e+09 | 5.105006e+09 | 5.401493e+09 | 2.314288e+09 | 5.731808e+09 | 6.124437e+09 | 6.671489e+09 | 7.436724e+09 | ... | 7.224868e+02 | 7.271290e+02 | 7.283779e+02 | 7.266484e+02 | 7.228327e+02 | 7.176899e+02 | 7.113072e+02 | 7.034274e+02 | 6.940296e+02 | NaN |
| std | 1.211687e+11 | 1.804814e+11 | 1.914083e+11 | 2.059170e+11 | 2.112150e+11 | 1.375059e+11 | 2.215546e+11 | 2.325489e+11 | 2.473986e+11 | 2.660957e+11 | ... | 2.215845e+04 | 2.287990e+04 | 2.352338e+04 | 2.408149e+04 | 2.455897e+04 | 2.496587e+04 | 2.530183e+04 | 2.556069e+04 | 2.574189e+04 | NaN |
| min | -1.435564e+00 | -1.594625e+00 | -3.056522e+00 | -4.032582e+00 | -4.213563e+00 | -3.658569e+00 | -2.950945e+00 | -3.174870e+00 | -3.558749e+00 | -2.973612e+00 | ... | -1.630000e+00 | -1.440000e+00 | -1.260000e+00 | -1.090000e+00 | -9.200000e-01 | -7.800000e-01 | -6.500000e-01 | -5.500000e-01 | -4.500000e-01 | NaN |
| 25% | 8.900000e-01 | 8.853210e+00 | 9.240920e+00 | 9.595200e+00 | 9.861595e+00 | 1.400000e+00 | 9.312615e+00 | 9.519913e+00 | 1.000000e+01 | 1.000000e+01 | ... | 3.000000e-02 | 3.000000e-02 | 2.000000e-02 | 2.000000e-02 | 1.000000e-02 | 1.000000e-02 | 1.000000e-02 | 1.000000e-02 | 1.000000e-02 | NaN |
| 50% | 6.317724e+00 | 6.316240e+01 | 6.655139e+01 | 6.969595e+01 | 7.087760e+01 | 9.677420e+00 | 7.101590e+01 | 7.133326e+01 | 7.290512e+01 | 7.510173e+01 | ... | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.300000e-01 | 2.200000e-01 | NaN |
| 75% | 6.251250e+01 | 5.655200e+04 | 5.863650e+04 | 6.202900e+04 | 6.383675e+04 | 7.854163e+01 | 5.682800e+04 | 5.739175e+04 | 5.940425e+04 | 6.411500e+04 | ... | 7.505000e+00 | 7.500000e+00 | 7.300000e+00 | 7.100000e+00 | 6.722500e+00 | 6.080000e+00 | 5.462500e+00 | 4.680000e+00 | 4.032500e+00 | NaN |
| max | 1.903929e+13 | 1.986457e+13 | 2.100916e+13 | 2.238367e+13 | 2.282991e+13 | 2.300634e+13 | 2.424128e+13 | 2.521383e+13 | 2.622101e+13 | 2.730873e+13 | ... | 2.951569e+06 | 3.070879e+06 | 3.169711e+06 | 3.246239e+06 | 3.301586e+06 | 3.337871e+06 | 3.354746e+06 | 3.351887e+06 | 3.330484e+06 | NaN |
8 rows × 66 columns
info_data(data)
------------------------------------------------------------------------------------------------------ LES DONNES Nombre de lignes : 886930 lignes Nombre de colonnes : 70 colonnes Nombre d'éléments dupliqués dans le dataframe : 0 eléments Nombre total de données manquantes dans le dataframe : 53455179 données manquantes sur 62085100 (86.1%)
info_data_na(data)
Nombre total de données manquantes dans le dataframe : 53455179 données manquantes sur 62085100 (86.1%) ------------------------------------------------------------------------------------------------------ Visualisation des données manquantes
# data['Indicator Name'].unique().tolist()
# Nombre d'indicateurs Indicator Name
data['Indicator Name'].nunique()
3665
data['Country Code'].nunique()
242
data['Country Name'].nunique()
242
# 3665 indicateurs * 242 pays et zones = notre nb le ligne
data['Indicator Code'].nunique() * data['Country Name'].nunique() == data.shape[0]
True
# On a le même nombre d'indicateurs dans data et dans serie
serie['Series Code'].nunique() == data['Indicator Code'].nunique()
True
data['Country Code'].nunique() == country['Country Code'].nunique()
# Les code ISO-3 présents ne sont pas les mêmes dans ces 2 fichiers
False
# On recheche la liste des pays qui sont différents entre ces 2 fichiers
df_pays_en_trop = data[~data['Country Code'].isin(country['Country Code'])][['Country Name','Country Code']]
df_pays_en_trop.nunique()
Country Name 1 Country Code 1 dtype: int64
# Pays en trop dans EdStatsData.csv est 'British Virgin Islands'
df_pays_en_trop.drop_duplicates(inplace=True)
df_pays_en_trop
| Country Name | Country Code | |
|---|---|---|
| 190580 | British Virgin Islands | VGB |
data['Country Code'].nunique() == country_Series['CountryCode'].nunique()
# Les code ISO-3 présents ne sont pas les mêmes dans ces 2 fichiers
False
# On recheche la liste des pays qui sont différents entre ces 2 fichiers
df_pays_en_trop = data[~data['Country Code'].isin(country_Series['CountryCode'])][['Country Name','Country Code']]
# nombre de Country Code en trop
df_pays_en_trop.nunique()
Country Name 31 Country Code 31 dtype: int64
df_pays_en_trop.drop_duplicates(inplace=True)
df_pays_en_trop
| Country Name | Country Code | |
|---|---|---|
| 0 | Arab World | ARB |
| 3665 | East Asia & Pacific | EAS |
| 7330 | East Asia & Pacific (excluding high income) | EAP |
| 10995 | Euro area | EMU |
| 14660 | Europe & Central Asia | ECS |
| 18325 | Europe & Central Asia (excluding high income) | ECA |
| 21990 | European Union | EUU |
| 25655 | Heavily indebted poor countries (HIPC) | HPC |
| 29320 | High income | HIC |
| 32985 | Latin America & Caribbean | LCN |
| 36650 | Latin America & Caribbean (excluding high income) | LAC |
| 40315 | Least developed countries: UN classification | LDC |
| 43980 | Low & middle income | LMY |
| 47645 | Low income | LIC |
| 51310 | Lower middle income | LMC |
| 54975 | Middle East & North Africa | MEA |
| 58640 | Middle East & North Africa (excluding high inc... | MNA |
| 62305 | Middle income | MIC |
| 65970 | North America | NAC |
| 69635 | OECD members | OED |
| 73300 | South Asia | SAS |
| 76965 | Sub-Saharan Africa | SSF |
| 80630 | Sub-Saharan Africa (excluding high income) | SSA |
| 84295 | Upper middle income | UMC |
| 87960 | World | WLD |
| 190580 | British Virgin Islands | VGB |
| 274875 | Curacao | CUW |
| 593730 | Nauru | NRU |
| 718340 | Sint Maarten (Dutch part) | SXM |
| 740330 | South Sudan | SSD |
| 758655 | St. Martin (French part) | MAF |
# Zones et indication revennus : suppression des subdivision de pays
liste_zones_et_income=df_pays_en_trop['Country Code'].to_list()
print("liste des codes ISO-3 des zones pays et incomes :",liste_zones_et_income)
liste des codes ISO-3 des zones pays et incomes : ['ARB', 'EAS', 'EAP', 'EMU', 'ECS', 'ECA', 'EUU', 'HPC', 'HIC', 'LCN', 'LAC', 'LDC', 'LMY', 'LIC', 'LMC', 'MEA', 'MNA', 'MIC', 'NAC', 'OED', 'SAS', 'SSF', 'SSA', 'UMC', 'WLD', 'VGB', 'CUW', 'NRU', 'SXM', 'SSD', 'MAF']
# Répartition du nombre de données renseignées par années pour tous les indicateurs et tous les pays
plt.figure(figsize=(15, 10))
donnees_dispo = data.loc[:,'1970':'2100'].notnull().sum()
ax1 = donnees_dispo.plot(kind='bar', color='SteelBlue')
ax1.set_title('Répartition du nombre de données par année',fontweight='bold')
ax1.set_ylabel('Nombre de données')
plt.savefig("assets/graphiques/EDA_10_Répartition du nombre de données par année.jpg", bbox_inches='tight')
ax1.grid(False)
# On créer des colonnes par decennies
data['1970s'] = data.loc[:,'1970':'1979'].mean(1)
data['1980s'] = data.loc[:,'1980':'1989'].mean(1)
data['1990s'] = data.loc[:,'1990':'1999'].mean(1)
data['2000s'] = data.loc[:,'2000':'2009'].mean(1)
data['2010s'] = data.loc[:,'2010':'2017'].mean(1)
data['2020s'] = data.loc[:,['2020','2025']].mean(1)
data['2030s'] = data.loc[:,['2030','2035']].mean(1)
data['2040s'] = data.loc[:,['2040','2045']].mean(1)
data['2050s'] = data.loc[:,['2050','2055']].mean(1)
data['2060s'] = data.loc[:,['2060','2065']].mean(1)
data['2070s'] = data.loc[:,['2070','2075']].mean(1)
data['2080s'] = data.loc[:,['2080','2085']].mean(1)
data['2090s'] = data.loc[:,['2090','2095']].mean(1)
data['2100s'] = data.loc[:,'2050']
# On crée df_decennies de data pour travailler dessus et regrouper les années par decennies
#ddata_decennie pour annalyser les décennies
df_decennies = data.copy()
# On affiche les colonnes
df_decennies.columns
# Il manque les années 2018,2019
# A partir de 2020, 2 colonnes par décennie
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
'1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100', 'Unnamed: 69', '1970s', '1980s', '1990s', '2000s',
'2010s', '2020s', '2030s', '2040s', '2050s', '2060s', '2070s', '2080s',
'2090s', '2100s'],
dtype='object')
df_decennies.drop(['1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100'], axis=1, inplace=True)
# Nombre total de données - le nombre de données manquante par décénies
decennies = ['1970s', '1980s', '1990s', '2000s', '2010s', '2020s', '2030s', '2040s','2050s', '2060s', '2070s', '2080s', '2090s', '2100s']
plt.figure(figsize=(16, 4))
plt.title('Nombre de données significatives par décennie', size=15,fontweight='bold')
sns.set_style('whitegrid')
sns.barplot(y = df_decennies.shape[0]-df_decennies[decennies].isna().sum().values, x = df_decennies[decennies].isna().sum().index)
plt.savefig("assets/graphiques/EDA_11_Répartition du nombre de données.jpg", bbox_inches='tight')
plt.figure(figsize=(8, 6))
donnees = df_decennies.loc[:,'1970s':'2010s'].notnull().sum()
ax1 = donnees.plot(kind='bar', color='SteelBlue')
ax1.set_title('Répartition du nombre de données par décennies',weight='bold')
ax1.set_ylabel('Nombre de données')
plt.xticks(rotation = 0)
for i in range(5):
plt.text(i-0.23,donnees[i]+2000, round(donnees[i],1)) # on place en fonction des coordonnées
ax1.grid(False)
plt.savefig("assets/graphiques/EDA_12_Répartition du nombre de données par decennies.jpg", bbox_inches='tight')
plt.show()
data.columns
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
'1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100', 'Unnamed: 69', '1970s', '1980s', '1990s', '2000s',
'2010s', '2020s', '2030s', '2040s', '2050s', '2060s', '2070s', '2080s',
'2090s', '2100s'],
dtype='object')
# On fait une jointure de country et data
data = data.merge(right=country[['Country Code','Region','Income Group']],
on='Country Code',
how='left')
# data.drop(['Unnamed: 69'], axis=1, inplace=True)
data.columns
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
'1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100', 'Unnamed: 69', '1970s', '1980s', '1990s', '2000s',
'2010s', '2020s', '2030s', '2040s', '2050s', '2060s', '2070s', '2080s',
'2090s', '2100s', 'Region', 'Income Group'],
dtype='object')
# Repartition des données par région pour la décénnie 2010s
target_decade = ['2010','2011','2012','2013','2014','2015','2016','2017']
data.groupby('Region')[target_decade].count().sum(axis=1).sort_values().reset_index()
| Region | 0 | |
|---|---|---|
| 0 | North America | 11386 |
| 1 | South Asia | 36446 |
| 2 | Middle East & North Africa | 95397 |
| 3 | East Asia & Pacific | 118810 |
| 4 | Latin America & Caribbean | 157163 |
| 5 | Sub-Saharan Africa | 205030 |
| 6 | Europe & Central Asia | 263389 |
# Taux de remplissage
nb_tt = data.shape[0]
nb_nonnull=data.copy()[target_decade].count()
# Taux de données non nulle par année pour 2010s
df_2010s = pd.DataFrame({'annees':nb_nonnull.index, 'nb_nonnull':nb_nonnull.values})
# % donnee null par
df_2010s['%_nonnull']=round((df_2010s['nb_nonnull'])*100/nb_tt,2)
# Nb nan pour les années cibles
df_2010s['%_NaN']=round(100-df_2010s['%_nonnull'],2)
# Definir les années comme index
df_2010s = df_2010s.set_index('annees')
df_2010s
| nb_nonnull | %_nonnull | %_NaN | |
|---|---|---|---|
| annees | |||
| 2010 | 242442 | 27.33 | 72.67 |
| 2011 | 146012 | 16.46 | 83.54 |
| 2012 | 147264 | 16.60 | 83.40 |
| 2013 | 137509 | 15.50 | 84.50 |
| 2014 | 113789 | 12.83 | 87.17 |
| 2015 | 131058 | 14.78 | 85.22 |
| 2016 | 16460 | 1.86 | 98.14 |
| 2017 | 143 | 0.02 | 99.98 |
# Graphique Taux de remplissage
from matplotlib import rc
plt.figure(figsize=(8, 6))
rc('font', weight='bold')
barWidth = 0.9
annees = ['2010','2011','2012','2013','2014','2015','2016','2017']
taux_remplissage=df_2010s['%_nonnull']
taux_NaN=df_2010s['%_NaN']
plt.bar(annees, taux_remplissage, barWidth, label='Données', color='steelblue')
plt.bar(annees, taux_NaN, barWidth, bottom=taux_remplissage, label='NaN', color='gray')
plt.ylabel('%')
names = ['2010','2011','2012','2013','2014','2015','2016','2017']
r = [0,1,2,3,4,5,6,7]
plt.xticks(r, names, fontweight='bold')
plt.title('Taux de remplissage entre 2010 et 2017',fontweight='bold',size=15)
plt.legend(bbox_to_anchor=(1, 1),loc = 2)
for i, var in enumerate(df_2010s['%_nonnull']):
plt.text(i-0.4,8,str(var)+"%",color='white',fontweight='bold')
plt.grid(False)
plt.savefig("assets/graphiques/EDA_13_Taux de remplissage 2010-2017.jpg", bbox_inches='tight')
plt.show()
**Bilan de : `EdStatsData.csv`**:
Les données chargées à partir du site worldbank.org permettent d'obtenir un historique d'indicateurs propres à notre problématique métier l'éducation selon les pays, l'économie, la démographie, la richesse, la disponibilité technologique, variables qui permettent de répondre aux problématiques de l'entreprise
Détails d'analyse de notre jeux de données :
EdStatsCountry.csv
Notions exploitables pour notre analyse
EdStatsCountry-Series.csv
EdStatsData.csv
Malgré les valeurs manquantes, ce jeu de données est essentiel dans notre analyse pour répondre aux problématiques.
EdStatsSeries.csv
EdStatsFootNote.csv
Les années 2000 à 2015 apparaissent comme les années qui sont à la fois les dernières valeurs exploitables et les mieux renseignées.
df_taux_indicateurs = data.copy()
annees =['2010','2011','2012','2013','2014','2015']
df_taux_indicateurs=df_taux_indicateurs[~df_taux_indicateurs['Country Code'].isin(liste_zones_et_income)]
# Compter le nombre d'indicateur présents par années
df_taux_indicateurs = df_taux_indicateurs.groupby('Indicator Code')[annees].count()
df_taux_indicateurs['nb_indicateur']=df_taux_indicateurs[annees].sum(axis=1)
df_taux_indicateurs['%'] = (df_taux_indicateurs['nb_indicateur']/(data['Country Name'].nunique()*len(annees)))*100
df_taux_indicateurs.head()
| 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | nb_indicateur | % | |
|---|---|---|---|---|---|---|---|---|
| Indicator Code | ||||||||
| BAR.NOED.1519.FE.ZS | 144 | 0 | 0 | 0 | 0 | 0 | 144 | 9.917355 |
| BAR.NOED.1519.ZS | 144 | 0 | 0 | 0 | 0 | 0 | 144 | 9.917355 |
| BAR.NOED.15UP.FE.ZS | 144 | 0 | 0 | 0 | 0 | 0 | 144 | 9.917355 |
| BAR.NOED.15UP.ZS | 144 | 0 | 0 | 0 | 0 | 0 | 144 | 9.917355 |
| BAR.NOED.2024.FE.ZS | 144 | 0 | 0 | 0 | 0 | 0 | 144 | 9.917355 |
# On crée une colonne total = nombre d'indicateur pour tous les pays par indicateur
# df_taux_indicateurs.sort_values(by ='%',ascending=False)
print(f"Nombre de Country Name : {data['Country Name'].nunique()}")
print(f"Nombre d'années : {len(annees)}")
Nombre de Country Name : 242 Nombre d'années : 6
df_taux_indicateurs[['nb_indicateur','%']].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| nb_indicateur | 3665.0 | 237.404366 | 302.391383 | 0.0 | 15.000000 | 135.000000 | 332.000000 | 1262.000000 |
| % | 3665.0 | 16.350163 | 20.825853 | 0.0 | 1.033058 | 9.297521 | 22.865014 | 86.914601 |
# Taux de remplissage
sns.kdeplot(data=df_taux_indicateurs, x="%")
plt.grid(False)
plt.xlim([0,100])
plt.title('Taux de remplissage des indicateurs',weight='bold',size=15)
plt.savefig("assets/graphiques/PREPROCESSING_1_Taux de remplissage des indicateurs.jpg", bbox_inches='tight')
plt.show()
# On garde tous les indicateur >50 % ==> Seuil = 50%
df_indicateurs_reduit = df_taux_indicateurs[df_taux_indicateurs['%']>80]
df_indicateurs_reduit['Indicator Code'] = df_indicateurs_reduit.index
print(f"Nombre d'indicateurs restants : {df_indicateurs_reduit.shape[0]}")
Nombre d'indicateurs restants : 16
df_indicateurs_reduit
| 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | nb_indicateur | % | Indicator Code | |
|---|---|---|---|---|---|---|---|---|---|
| Indicator Code | |||||||||
| IT.NET.USER.P2 | 202 | 204 | 202 | 201 | 201 | 201 | 1211 | 83.402204 | IT.NET.USER.P2 |
| NY.GDP.MKTP.CD | 202 | 202 | 198 | 199 | 197 | 196 | 1194 | 82.231405 | NY.GDP.MKTP.CD |
| NY.GDP.MKTP.KD | 202 | 198 | 196 | 196 | 194 | 193 | 1179 | 81.198347 | NY.GDP.MKTP.KD |
| NY.GDP.PCAP.CD | 202 | 202 | 198 | 199 | 197 | 196 | 1194 | 82.231405 | NY.GDP.PCAP.CD |
| NY.GDP.PCAP.KD | 202 | 198 | 196 | 196 | 194 | 193 | 1179 | 81.198347 | NY.GDP.PCAP.KD |
| SE.PRM.AGES | 203 | 203 | 203 | 203 | 203 | 203 | 1218 | 83.884298 | SE.PRM.AGES |
| SE.PRM.DURS | 203 | 203 | 203 | 203 | 203 | 203 | 1218 | 83.884298 | SE.PRM.DURS |
| SE.SEC.AGES | 200 | 199 | 199 | 200 | 200 | 200 | 1198 | 82.506887 | SE.SEC.AGES |
| SE.SEC.DURS | 200 | 199 | 199 | 200 | 200 | 200 | 1198 | 82.506887 | SE.SEC.DURS |
| SE.SEC.DURS.LO | 200 | 199 | 199 | 200 | 200 | 200 | 1198 | 82.506887 | SE.SEC.DURS.LO |
| SE.SEC.DURS.UP | 200 | 199 | 199 | 200 | 200 | 200 | 1198 | 82.506887 | SE.SEC.DURS.UP |
| SP.POP.GROW | 211 | 210 | 210 | 210 | 210 | 210 | 1261 | 86.845730 | SP.POP.GROW |
| SP.POP.TOTL | 211 | 211 | 210 | 210 | 210 | 210 | 1262 | 86.914601 | SP.POP.TOTL |
| UIS.THAGE.0 | 200 | 200 | 200 | 200 | 198 | 198 | 1196 | 82.369146 | UIS.THAGE.0 |
| UIS.THAGE.3.A.GPV | 200 | 199 | 199 | 200 | 199 | 198 | 1195 | 82.300275 | UIS.THAGE.3.A.GPV |
| UIS.THDUR.0 | 200 | 200 | 200 | 200 | 200 | 200 | 1200 | 82.644628 | UIS.THDUR.0 |
On recherche des indicateurs pour 4 critères d'analyse :
# Création d'un dataframe de recherche
df_search = serie[['Series Code', 'Indicator Name','Topic','Long definition']]
# Target 15-24 ans
liste_1524 = []
for x in df_search['Series Code']:
if "1524" in x:
liste_1524.append(x)
df_1524 = df_search[df_search['Series Code'].isin(liste_1524)][['Series Code','Indicator Name','Topic','Long definition']]
# On affiche la selection
pd.set_option('max_colwidth', None)
df_1524
| Series Code | Indicator Name | Topic | Long definition | |
|---|---|---|---|---|
| 2209 | SE.ADT.1524.LT.FE.ZS | Youth literacy rate, population 15-24 years, female (%) | Literacy | Number of females age 15 to 24 years who can both read and write with understanding a short simple statement on their everyday life, divided by the female population in that age group. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. Divide the number of people aged 15 to 24 years who are literate by the total population in the same age group and multiply the result by 100. |
| 2210 | SE.ADT.1524.LT.FM.ZS | Youth literacy rate, population 15-24 years, gender parity index (GPI) | Literacy | Ratio of female youth literacy rate to male youth literacy rate. It is calculated by dividing the female value for the indicator by the male value for the indicator. A GPI equal to 1 indicates parity between females and males. In general, a value less than 1 indicates disparity in favor of males and a value greater than 1 indicates disparity in favor of females. |
| 2211 | SE.ADT.1524.LT.MA.ZS | Youth literacy rate, population 15-24 years, male (%) | Literacy | Number of males age 15 to 24 years who can both read and write with understanding a short simple statement on their everyday life, divided by the male population in that age group. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. Divide the number of people aged 15 to 24 years who are literate by the total population in the same age group and multiply the result by 100. |
| 2212 | SE.ADT.1524.LT.ZS | Youth literacy rate, population 15-24 years, both sexes (%) | Literacy | Number of people age 15 to 24 years who can both read and write with understanding a short simple statement on their everyday life, divided by the population in that age group. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. Divide the number of people aged 15 to 24 years who are literate by the total population in the same age group and multiply the result by 100. |
| 2504 | SP.POP.1524.FE.UN | Population, ages 15-24, female | Population | Population, ages 15-24, female is the total number of females age 15-24. |
| 2505 | SP.POP.1524.MA.UN | Population, ages 15-24, male | Population | Population, ages 15-24, male is the total number of males age 15-24. |
| 2506 | SP.POP.1524.TO.UN | Population, ages 15-24, total | Population | Population, ages 15-24, total is the total population age 15-24. |
Indicateur retenu : Population 15-20 ans : SP.POP.1524.TO.UN : Population, ages 15-24, total is the total population age 15-24.
# Target : Infrastructure Comprehensive list
liste_IT = [x for x in df_search["Long definition"] if "Internet" in x]
df_IT = df_search[df_search['Long definition'].isin(liste_IT)][['Series Code','Indicator Name','Topic','Long definition']]
# On affiche la selection
pd.set_option('max_colwidth', None)
df_IT
| Series Code | Indicator Name | Topic | Long definition | |
|---|---|---|---|---|
| 611 | IT.NET.USER.P2 | Internet users (per 100 people) | Infrastructure: Communications | Internet users are individuals who have used the Internet (from any location) in the last 3 months. The Internet can be used via a computer, mobile phone, personal digital assistant, games machine, digital TV etc. |
Indicateur retenu : Intenet User : IT.NET.USER.P2 : Internet users are individuals who have used the Internet
# Target : Economie Comprehensive list
# first filter NY = incomplet ==> NY.GNP
liste_ECO = [x for x in df_search["Series Code"] if ("NY.GNP") in x]
df_ECO = df_search[df_search['Series Code'].isin(liste_ECO)][['Series Code','Indicator Name','Topic','Long definition']]
# On affiche la selection
pd.set_option('max_colwidth', None)
df_ECO
| Series Code | Indicator Name | Topic | Long definition | |
|---|---|---|---|---|
| 1666 | NY.GNP.MKTP.CD | GNI (current US$) | Economic Policy & Debt: National accounts: US$ at current prices: Aggregate indicators | GNI (formerly GNP) is the sum of value added by all resident producers plus any product taxes (less subsidies) not included in the valuation of output plus net receipts of primary income (compensation of employees and property income) from abroad. Data are in current U.S. dollars. |
| 1667 | NY.GNP.MKTP.PP.CD | GNI, PPP (current international $) | Economic Policy & Debt: Purchasing power parity | PPP GNI (formerly PPP GNP) is gross national income (GNI) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GNI as a U.S. dollar has in the United States. Gross national income is the sum of value added by all resident producers plus any product taxes (less subsidies) not included in the valuation of output plus net receipts of primary income (compensation of employees and property income) from abroad. Data are in current international dollars. For most economies PPP figures are extrapolated from the 2011 International Comparison Program (ICP) benchmark estimates or imputed using a statistical model based on the 2011 ICP. For 47 high- and upper middle-income economies conversion factors are provided by Eurostat and the Organisation for Economic Co-operation and Development (OECD). |
| 1668 | NY.GNP.PCAP.CD | GNI per capita, Atlas method (current US$) | Economic Policy & Debt: National accounts: Atlas GNI & GNI per capita | GNI per capita (formerly GNP per capita) is the gross national income, converted to U.S. dollars using the World Bank Atlas method, divided by the midyear population. GNI is the sum of value added by all resident producers plus any product taxes (less subsidies) not included in the valuation of output plus net receipts of primary income (compensation of employees and property income) from abroad. GNI, calculated in national currency, is usually converted to U.S. dollars at official exchange rates for comparisons across economies, although an alternative rate is used when the official exchange rate is judged to diverge by an exceptionally large margin from the rate actually applied in international transactions. To smooth fluctuations in prices and exchange rates, a special Atlas method of conversion is used by the World Bank. This applies a conversion factor that averages the exchange rate for a given year and the two preceding years, adjusted for differences in rates of inflation between the country, and through 2000, the G-5 countries (France, Germany, Japan, the United Kingdom, and the United States). From 2001, these countries include the Euro area, Japan, the United Kingdom, and the United States. |
| 1669 | NY.GNP.PCAP.PP.CD | GNI per capita, PPP (current international $) | Economic Policy & Debt: Purchasing power parity | GNI per capita based on purchasing power parity (PPP). PPP GNI is gross national income (GNI) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GNI as a U.S. dollar has in the United States. GNI is the sum of value added by all resident producers plus any product taxes (less subsidies) not included in the valuation of output plus net receipts of primary income (compensation of employees and property income) from abroad. Data are in current international dollars based on the 2011 ICP round. |
Indicateur retenu : ECO : NY.GNP.PCAP.PP.CD : GNI per capita based on purchasing power parity (PPP).
# Target : Etudiant engagés post Bac
# Deuxième filtre filtre
liste_Secondaire = [ x for x in df_search['Series Code'] if ("SEC.ENRL") in x]
df_Secondaire = df_search[df_search['Series Code'].isin(liste_Secondaire)][['Series Code','Indicator Name','Topic','Long definition']]
# On affiche la selection
pd.set_option('max_colwidth', None)
df_Secondaire
| Series Code | Indicator Name | Topic | Long definition | |
|---|---|---|---|---|
| 2294 | SE.SEC.ENRL | Enrolment in secondary education, both sexes (number) | Secondary | Total number of students enrolled at public and private secondary education institutions regardless of age. |
| 2295 | SE.SEC.ENRL.FE | Enrolment in secondary education, female (number) | Secondary | Total number of female students enrolled at public and private secondary education institutions regardless of age. |
| 2296 | SE.SEC.ENRL.FE.VO.ZS | Percentage of female students in secondary education enrolled in vocational programmes, female (%) | Secondary | Total number of female students enrolled in vocational programmes at the secondary education level, expressed as a percentage of the total number of female students enrolled in all programmes (vocational and general) at the secondary level. Vocational education is designed for learners to acquire the knowledge, skills and competencies specific to a particular occupation or trade or class of occupations or trades. Vocational education may have work-based components (e.g. apprenticeships). Successful completion of such programmes leads to labour-market relevant vocational qualifications acknowledged as occupationally-oriented by the relevant national authorities and/or the labour market. |
| 2297 | SE.SEC.ENRL.FE.ZS | Percentage of students in secondary education who are female (%) | Secondary | Number of female students enrolled in all secondary education programmes expressed as a percentage of the total number of students (male and female) enrolled at the secondary education level in a given school year. |
| 2298 | SE.SEC.ENRL.GC | Enrolment in secondary general, both sexes (number) | Secondary | Total number of students enrolled in general programmes at public and private secondary education institutions regardless of age. |
| 2299 | SE.SEC.ENRL.GC.FE | Enrolment in secondary general, female (number) | Secondary | Total number of female students enrolled in general programmes at public and private secondary education institutions regardless of age. |
| 2300 | SE.SEC.ENRL.GC.FE.ZS | Percentage of students in secondary general education who are female (%) | Secondary | Number of female students enrolled in general programmes at the secondary education level expressed as a percentage of the total number of students (male and female) enrolled in general programmes at the secondary education level in a given school year. |
| 2301 | SE.SEC.ENRL.MA.VO.ZS | Percentage of male students in secondary education enrolled in vocational programmes, male (%) | Secondary | Total number of male students enrolled in vocational programmes at the secondary education level, expressed as a percentage of the total number of male students enrolled in all programmes (vocational and general) at the secondary level. Vocational education is designed for learners to acquire the knowledge, skills and competencies specific to a particular occupation or trade or class of occupations or trades. Vocational education may have work-based components (e.g. apprenticeships). Successful completion of such programmes leads to labour-market relevant vocational qualifications acknowledged as occupationally-oriented by the relevant national authorities and/or the labour market. |
| 2302 | SE.SEC.ENRL.TC.ZS | Pupil-teacher ratio in secondary education (headcount basis) | Teachers | Average number of pupils per teacher at a given level of education, based on headcounts of both pupils and teachers. Divide the total number of pupils enrolled at the specified level of education by the number of teachers at the same level. In computing and interpreting this indicator, one should take into account the existence of part-time teaching, school-shifts, multi-grade classes and other practices that may affect the precision and meaningfulness of pupil-teacher ratios. When feasible, the number of part-time teachers is converted to ‘full-time equivalent’ teachers; a double-shift teacher is counted twice, etc. Teachers are defined as persons whose professional activity involves the transmitting of knowledge, attitudes and skills that are stipulated in a formal curriculum programme to students enrolled in a formal educational institution. |
| 2303 | SE.SEC.ENRL.VO | Enrolment in secondary vocational, both sexes (number) | Secondary | Total number of students enrolled in vocational programmes at public and private secondary education institutions. Vocational education is designed for learners to acquire the knowledge, skills and competencies specific to a particular occupation or trade or class of occupations or trades. Vocational education may have work-based components (e.g. apprenticeships). Successful completion of such programmes leads to labour-market relevant vocational qualifications acknowledged as occupationally-oriented by the relevant national authorities and/or the labour market. |
| 2304 | SE.SEC.ENRL.VO.FE | Enrolment in secondary vocational, female (number) | Secondary | Total number of female students enrolled in vocational programmes at public and private secondary education institutions. Vocational education is designed for learners to acquire the knowledge, skills and competencies specific to a particular occupation or trade or class of occupations or trades. Vocational education may have work-based components (e.g. apprenticeships). Successful completion of such programmes leads to labour-market relevant vocational qualifications acknowledged as occupationally-oriented by the relevant national authorities and/or the labour market. |
| 2305 | SE.SEC.ENRL.VO.FE.ZS | Percentage of students in secondary vocational education who are female (%) | Secondary | Number of female students enrolled in vocational programmes at the secondary education level expressed as a percentage of the total number of students (male and female) enrolled in vocational programmes at the secondary education level in a given school year. |
| 2306 | SE.SEC.ENRL.VO.ZS | Percentage of students in secondary education enrolled in vocational programmes, both sexes (%) | Secondary | Total number of students enrolled in vocational programmes at the secondary education level, expressed as a percentage of the total number of students enrolled in all programmes (vocational and general) at the secondary level. Vocational education is designed for learners to acquire the knowledge, skills and competencies specific to a particular occupation or trade or class of occupations or trades. Vocational education may have work-based components (e.g. apprenticeships). Successful completion of such programmes leads to labour-market relevant vocational qualifications acknowledged as occupationally-oriented by the relevant national authorities and/or the labour market. |
# Target : Etudiant engagés post Bac
# Deuxième filtre filtre
liste_PostBac = [ x for x in df_search['Series Code'] if ("TER.ENR") in x]
df_PostBac = df_search[df_search['Series Code'].isin(liste_PostBac)][['Series Code','Indicator Name','Topic','Long definition']]
# On affiche la selection
pd.set_option('max_colwidth', None)
df_PostBac
| Series Code | Indicator Name | Topic | Long definition | |
|---|---|---|---|---|
| 2332 | SE.TER.ENRL | Enrolment in tertiary education, all programmes, both sexes (number) | Tertiary | The total number of students enrolled at public and private tertiary education institutions. |
| 2333 | SE.TER.ENRL.FE | Enrolment in tertiary education, all programmes, female (number) | Tertiary | The total number of female students enrolled at public and private tertiary education institutions. |
| 2334 | SE.TER.ENRL.FE.ZS | Percentage of students in tertiary education who are female (%) | Tertiary | Number of female students at the tertiary education level (ISCED 5 to 8) expressed as a percentage of the total number of students (male and female) at the tertiary education level (ISCED 5 to 8) in a given school year. |
| 2335 | SE.TER.ENRR | Gross enrolment ratio, tertiary, both sexes (%) | Tertiary | Total enrollment in tertiary education (ISCED 5 to 8), regardless of age, expressed as a percentage of the total population of the five-year age group following on from secondary school leaving. |
| 2336 | SE.TER.ENRR.FE | Gross enrolment ratio, tertiary, female (%) | Tertiary | Total female enrollment in tertiary education (ISCED 5 to 8), regardless of age, expressed as a percentage of the total female population of the five-year age group following on from secondary school leaving. |
| 2337 | SE.TER.ENRR.MA | Gross enrolment ratio, tertiary, male (%) | Tertiary | Total male enrollment in tertiary education (ISCED 5 to 8), regardless of age, expressed as a percentage of the total male population of the five-year age group following on from secondary school leaving. |
# Taux de remplissage sur 2010-2015
indicateurs_retenus=['SP.POP.1524.TO.UN', 'SE.SEC.ENRR', 'SE.TER.ENRR', 'IT.NET.USER.P2', 'NY.GNP.PCAP.PP.CD']
mask=df_indicateurs_reduit['Indicator Code'].isin(indicateurs_retenus)
cols=['Indicator Code','%']
df_indicateurs_reduit[mask][cols]
| Indicator Code | % | |
|---|---|---|
| Indicator Code | ||
| IT.NET.USER.P2 | IT.NET.USER.P2 | 83.402204 |
indicateurs_retenus=['IT.NET.USER.P2', 'NY.GNP.PCAP.PP.CD','SP.POP.1524.TO.UN', 'SE.SEC.ENRR', 'SE.TER.ENRR']
# On travail sur df
df = data
col = df.columns
# On supprime tout ce qui n'est pas pays et les pays na
print(liste_zones_et_income)
['ARB', 'EAS', 'EAP', 'EMU', 'ECS', 'ECA', 'EUU', 'HPC', 'HIC', 'LCN', 'LAC', 'LDC', 'LMY', 'LIC', 'LMC', 'MEA', 'MNA', 'MIC', 'NAC', 'OED', 'SAS', 'SSF', 'SSA', 'UMC', 'WLD', 'VGB', 'CUW', 'NRU', 'SXM', 'SSD', 'MAF']
df.shape
(886930, 86)
df = df[~df['Country Code'].isin(liste_zones_et_income)][col]
df.shape
(773315, 86)
df.columns
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
'1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100', 'Unnamed: 69', '1970s', '1980s', '1990s', '2000s',
'2010s', '2020s', '2030s', '2040s', '2050s', '2060s', '2070s', '2080s',
'2090s', '2100s', 'Region', 'Income Group'],
dtype='object')
# On réduit le df aux colonnes et dates (2000-2015) qui nous concernent
df = df.loc[:,[ 'Country Name', 'Country Code','Region', 'Indicator Name','Indicator Code','Income Group','2010', '2011','2012', '2013', '2014', '2015']]
df.shape
(773315, 12)
col = df.columns
print(col)
Index(['Country Name', 'Country Code', 'Region', 'Indicator Name',
'Indicator Code', 'Income Group', '2010', '2011', '2012', '2013',
'2014', '2015'],
dtype='object')
# On supprime les indicateurs inutiles
df = df[df['Indicator Code'].isin(indicateurs_retenus)][col]
df.shape
(1055, 12)
msno.bar(df)
<AxesSubplot: >
df[df['Indicator Code'].isin(indicateurs_retenus)][col].groupby(['Indicator Name', 'Indicator Code']).count().reset_index().sort_values(by='Indicator Name',ascending=False)
| Indicator Name | Indicator Code | Country Name | Country Code | Region | Income Group | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | Population, ages 15-24, total | SP.POP.1524.TO.UN | 211 | 211 | 210 | 210 | 181 | 181 | 181 | 181 | 181 | 181 |
| 3 | Internet users (per 100 people) | IT.NET.USER.P2 | 211 | 211 | 210 | 210 | 202 | 204 | 202 | 201 | 201 | 201 |
| 2 | Gross enrolment ratio, tertiary, both sexes (%) | SE.TER.ENRR | 211 | 211 | 210 | 210 | 133 | 138 | 136 | 124 | 119 | 99 |
| 1 | Gross enrolment ratio, secondary, both sexes (%) | SE.SEC.ENRR | 211 | 211 | 210 | 210 | 145 | 148 | 142 | 133 | 133 | 116 |
| 0 | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | 211 | 211 | 210 | 210 | 187 | 187 | 185 | 185 | 183 | 181 |
df.isnull().mean()*100
Country Name 0.000000 Country Code 0.000000 Region 0.473934 Indicator Name 0.000000 Indicator Code 0.000000 Income Group 0.473934 2010 19.620853 2011 18.672986 2012 19.810427 2013 21.895735 2014 22.559242 2015 26.255924 dtype: float64
2015 est notre colone de base, on voit les na concernant cette année
# Df des lignes qui ont toutes des données manquantes tous les ans
df_noData = df.loc[(df['2010'].isnull()) & (df['2011'].isnull()) & (df['2012'].isnull()) & (df['2013'].isnull()) & (df['2014'].isnull()) & (df['2015'].isnull())]
pd.set_option('display.max_rows', None)
# On visualise les pays et les indicateurs concernés par le manque total de données
df_noData.loc[:,['Country Name','Country Code','Indicator Name']]
| Country Name | Country Code | Indicator Name | |
|---|---|---|---|
| 103872 | American Samoa | ASM | GNI per capita, PPP (current international $) |
| 103955 | American Samoa | ASM | Gross enrolment ratio, secondary, both sexes (%) |
| 103959 | American Samoa | ASM | Gross enrolment ratio, tertiary, both sexes (%) |
| 103995 | American Samoa | ASM | Internet users (per 100 people) |
| 105102 | American Samoa | ASM | Population, ages 15-24, total |
| 107537 | Andorra | AND | GNI per capita, PPP (current international $) |
| 107620 | Andorra | AND | Gross enrolment ratio, secondary, both sexes (%) |
| 107624 | Andorra | AND | Gross enrolment ratio, tertiary, both sexes (%) |
| 108767 | Andorra | AND | Population, ages 15-24, total |
| 116097 | Antigua and Barbuda | ATG | Population, ages 15-24, total |
| 125862 | Aruba | ABW | GNI per capita, PPP (current international $) |
| 136940 | Azerbaijan | AZE | Gross enrolment ratio, secondary, both sexes (%) |
| 140609 | Bahamas, The | BHS | Gross enrolment ratio, tertiary, both sexes (%) |
| 171072 | Bermuda | BMU | Population, ages 15-24, total |
| 177259 | Bolivia | BOL | Gross enrolment ratio, tertiary, both sexes (%) |
| 180920 | Bosnia and Herzegovina | BIH | Gross enrolment ratio, secondary, both sexes (%) |
| 180924 | Bosnia and Herzegovina | BIH | Gross enrolment ratio, tertiary, both sexes (%) |
| 184585 | Botswana | BWA | Gross enrolment ratio, secondary, both sexes (%) |
| 213905 | Cambodia | KHM | Gross enrolment ratio, secondary, both sexes (%) |
| 221239 | Canada | CAN | Gross enrolment ratio, tertiary, both sexes (%) |
| 224817 | Cayman Islands | CYM | GNI per capita, PPP (current international $) |
| 224900 | Cayman Islands | CYM | Gross enrolment ratio, secondary, both sexes (%) |
| 224904 | Cayman Islands | CYM | Gross enrolment ratio, tertiary, both sexes (%) |
| 226047 | Cayman Islands | CYM | Population, ages 15-24, total |
| 235812 | Channel Islands | CHI | GNI per capita, PPP (current international $) |
| 235895 | Channel Islands | CHI | Gross enrolment ratio, secondary, both sexes (%) |
| 235899 | Channel Islands | CHI | Gross enrolment ratio, tertiary, both sexes (%) |
| 235935 | Channel Islands | CHI | Internet users (per 100 people) |
| 237042 | Channel Islands | CHI | Population, ages 15-24, total |
| 272462 | Cuba | CUB | GNI per capita, PPP (current international $) |
| 290787 | Djibouti | DJI | GNI per capita, PPP (current international $) |
| 294539 | Dominica | DMA | Gross enrolment ratio, tertiary, both sexes (%) |
| 295682 | Dominica | DMA | Population, ages 15-24, total |
| 312860 | Equatorial Guinea | GNQ | Gross enrolment ratio, secondary, both sexes (%) |
| 312864 | Equatorial Guinea | GNQ | Gross enrolment ratio, tertiary, both sexes (%) |
| 327437 | Faroe Islands | FRO | GNI per capita, PPP (current international $) |
| 327520 | Faroe Islands | FRO | Gross enrolment ratio, secondary, both sexes (%) |
| 327524 | Faroe Islands | FRO | Gross enrolment ratio, tertiary, both sexes (%) |
| 328667 | Faroe Islands | FRO | Population, ages 15-24, total |
| 331189 | Fiji | FJI | Gross enrolment ratio, tertiary, both sexes (%) |
| 342097 | French Polynesia | PYF | GNI per capita, PPP (current international $) |
| 342180 | French Polynesia | PYF | Gross enrolment ratio, secondary, both sexes (%) |
| 342184 | French Polynesia | PYF | Gross enrolment ratio, tertiary, both sexes (%) |
| 343327 | French Polynesia | PYF | Population, ages 15-24, total |
| 345845 | Gabon | GAB | Gross enrolment ratio, secondary, both sexes (%) |
| 345849 | Gabon | GAB | Gross enrolment ratio, tertiary, both sexes (%) |
| 364087 | Gibraltar | GIB | GNI per capita, PPP (current international $) |
| 364170 | Gibraltar | GIB | Gross enrolment ratio, secondary, both sexes (%) |
| 364174 | Gibraltar | GIB | Gross enrolment ratio, tertiary, both sexes (%) |
| 365317 | Gibraltar | GIB | Population, ages 15-24, total |
| 371417 | Greenland | GRL | GNI per capita, PPP (current international $) |
| 371500 | Greenland | GRL | Gross enrolment ratio, secondary, both sexes (%) |
| 371504 | Greenland | GRL | Gross enrolment ratio, tertiary, both sexes (%) |
| 372647 | Greenland | GRL | Population, ages 15-24, total |
| 378747 | Guam | GUM | GNI per capita, PPP (current international $) |
| 378830 | Guam | GUM | Gross enrolment ratio, secondary, both sexes (%) |
| 378834 | Guam | GUM | Gross enrolment ratio, tertiary, both sexes (%) |
| 379977 | Guam | GUM | Population, ages 15-24, total |
| 389825 | Guinea-Bissau | GNB | Gross enrolment ratio, secondary, both sexes (%) |
| 389829 | Guinea-Bissau | GNB | Gross enrolment ratio, tertiary, both sexes (%) |
| 397155 | Haiti | HTI | Gross enrolment ratio, secondary, both sexes (%) |
| 397159 | Haiti | HTI | Gross enrolment ratio, tertiary, both sexes (%) |
| 426475 | Iraq | IRQ | Gross enrolment ratio, secondary, both sexes (%) |
| 426479 | Iraq | IRQ | Gross enrolment ratio, tertiary, both sexes (%) |
| 433722 | Isle of Man | IMN | GNI per capita, PPP (current international $) |
| 433805 | Isle of Man | IMN | Gross enrolment ratio, secondary, both sexes (%) |
| 433809 | Isle of Man | IMN | Gross enrolment ratio, tertiary, both sexes (%) |
| 433845 | Isle of Man | IMN | Internet users (per 100 people) |
| 434952 | Isle of Man | IMN | Population, ages 15-24, total |
| 459460 | Kenya | KEN | Gross enrolment ratio, secondary, both sexes (%) |
| 459464 | Kenya | KEN | Gross enrolment ratio, tertiary, both sexes (%) |
| 463125 | Kiribati | KIR | Gross enrolment ratio, secondary, both sexes (%) |
| 463129 | Kiribati | KIR | Gross enrolment ratio, tertiary, both sexes (%) |
| 464272 | Kiribati | KIR | Population, ages 15-24, total |
| 466707 | Korea, Dem. People’s Rep. | PRK | GNI per capita, PPP (current international $) |
| 474120 | Kosovo | XKX | Gross enrolment ratio, secondary, both sexes (%) |
| 474124 | Kosovo | XKX | Gross enrolment ratio, tertiary, both sexes (%) |
| 474160 | Kosovo | XKX | Internet users (per 100 people) |
| 475267 | Kosovo | XKX | Population, ages 15-24, total |
| 503440 | Libya | LBY | Gross enrolment ratio, secondary, both sexes (%) |
| 503444 | Libya | LBY | Gross enrolment ratio, tertiary, both sexes (%) |
| 507022 | Liechtenstein | LIE | GNI per capita, PPP (current international $) |
| 508252 | Liechtenstein | LIE | Population, ages 15-24, total |
| 536425 | Maldives | MDV | Gross enrolment ratio, secondary, both sexes (%) |
| 548567 | Marshall Islands | MHL | Population, ages 15-24, total |
| 562080 | Micronesia, Fed. Sts. | FSM | Gross enrolment ratio, secondary, both sexes (%) |
| 562084 | Micronesia, Fed. Sts. | FSM | Gross enrolment ratio, tertiary, both sexes (%) |
| 569327 | Monaco | MCO | GNI per capita, PPP (current international $) |
| 569410 | Monaco | MCO | Gross enrolment ratio, secondary, both sexes (%) |
| 569414 | Monaco | MCO | Gross enrolment ratio, tertiary, both sexes (%) |
| 570557 | Monaco | MCO | Population, ages 15-24, total |
| 577887 | Montenegro | MNE | Population, ages 15-24, total |
| 591400 | Namibia | NAM | Gross enrolment ratio, secondary, both sexes (%) |
| 591404 | Namibia | NAM | Gross enrolment ratio, tertiary, both sexes (%) |
| 605977 | New Caledonia | NCL | GNI per capita, PPP (current international $) |
| 606060 | New Caledonia | NCL | Gross enrolment ratio, secondary, both sexes (%) |
| 606064 | New Caledonia | NCL | Gross enrolment ratio, tertiary, both sexes (%) |
| 607207 | New Caledonia | NCL | Population, ages 15-24, total |
| 613394 | Nicaragua | NIC | Gross enrolment ratio, tertiary, both sexes (%) |
| 624302 | Northern Mariana Islands | MNP | GNI per capita, PPP (current international $) |
| 624385 | Northern Mariana Islands | MNP | Gross enrolment ratio, secondary, both sexes (%) |
| 624389 | Northern Mariana Islands | MNP | Gross enrolment ratio, tertiary, both sexes (%) |
| 624425 | Northern Mariana Islands | MNP | Internet users (per 100 people) |
| 625532 | Northern Mariana Islands | MNP | Population, ages 15-24, total |
| 631719 | Oman | OMN | Gross enrolment ratio, tertiary, both sexes (%) |
| 639085 | Palau | PLW | Internet users (per 100 people) |
| 640192 | Palau | PLW | Population, ages 15-24, total |
| 646379 | Papua New Guinea | PNG | Gross enrolment ratio, tertiary, both sexes (%) |
| 669512 | Puerto Rico | PRI | Population, ages 15-24, total |
| 686694 | Samoa | WSM | Gross enrolment ratio, tertiary, both sexes (%) |
| 690272 | San Marino | SMR | GNI per capita, PPP (current international $) |
| 691502 | San Marino | SMR | Population, ages 15-24, total |
| 706162 | Serbia | SRB | Population, ages 15-24, total |
| 709827 | Seychelles | SYC | Population, ages 15-24, total |
| 712349 | Sierra Leone | SLE | Gross enrolment ratio, tertiary, both sexes (%) |
| 716010 | Singapore | SGP | Gross enrolment ratio, secondary, both sexes (%) |
| 716014 | Singapore | SGP | Gross enrolment ratio, tertiary, both sexes (%) |
| 730674 | Solomon Islands | SLB | Gross enrolment ratio, tertiary, both sexes (%) |
| 734252 | Somalia | SOM | GNI per capita, PPP (current international $) |
| 734335 | Somalia | SOM | Gross enrolment ratio, secondary, both sexes (%) |
| 734339 | Somalia | SOM | Gross enrolment ratio, tertiary, both sexes (%) |
| 753807 | St. Kitts and Nevis | KNA | Population, ages 15-24, total |
| 763659 | St. Vincent and the Grenadines | VCT | Gross enrolment ratio, tertiary, both sexes (%) |
| 770989 | Suriname | SUR | Gross enrolment ratio, tertiary, both sexes (%) |
| 785562 | Syrian Arab Republic | SYR | GNI per capita, PPP (current international $) |
| 807639 | Tonga | TON | Gross enrolment ratio, tertiary, both sexes (%) |
| 811300 | Trinidad and Tobago | TTO | Gross enrolment ratio, secondary, both sexes (%) |
| 811304 | Trinidad and Tobago | TTO | Gross enrolment ratio, tertiary, both sexes (%) |
| 825877 | Turks and Caicos Islands | TCA | GNI per capita, PPP (current international $) |
| 825960 | Turks and Caicos Islands | TCA | Gross enrolment ratio, secondary, both sexes (%) |
| 825964 | Turks and Caicos Islands | TCA | Gross enrolment ratio, tertiary, both sexes (%) |
| 826000 | Turks and Caicos Islands | TCA | Internet users (per 100 people) |
| 827107 | Turks and Caicos Islands | TCA | Population, ages 15-24, total |
| 829629 | Tuvalu | TUV | Gross enrolment ratio, tertiary, both sexes (%) |
| 830772 | Tuvalu | TUV | Population, ages 15-24, total |
| 840620 | United Arab Emirates | ARE | Gross enrolment ratio, secondary, both sexes (%) |
| 840624 | United Arab Emirates | ARE | Gross enrolment ratio, tertiary, both sexes (%) |
| 858949 | Vanuatu | VUT | Gross enrolment ratio, tertiary, both sexes (%) |
| 862614 | Venezuela, RB | VEN | Gross enrolment ratio, tertiary, both sexes (%) |
| 866275 | Vietnam | VNM | Gross enrolment ratio, secondary, both sexes (%) |
| 869857 | Virgin Islands (U.S.) | VIR | GNI per capita, PPP (current international $) |
| 869940 | Virgin Islands (U.S.) | VIR | Gross enrolment ratio, secondary, both sexes (%) |
| 869944 | Virgin Islands (U.S.) | VIR | Gross enrolment ratio, tertiary, both sexes (%) |
| 871087 | Virgin Islands (U.S.) | VIR | Population, ages 15-24, total |
| 873522 | West Bank and Gaza | PSE | GNI per capita, PPP (current international $) |
| 880935 | Zambia | ZMB | Gross enrolment ratio, secondary, both sexes (%) |
# faire la liste des codes pays a supprimer
liste_noData = df_noData['Country Code'].unique()
liste_noData
array(['ASM', 'AND', 'ATG', 'ABW', 'AZE', 'BHS', 'BMU', 'BOL', 'BIH',
'BWA', 'KHM', 'CAN', 'CYM', 'CHI', 'CUB', 'DJI', 'DMA', 'GNQ',
'FRO', 'FJI', 'PYF', 'GAB', 'GIB', 'GRL', 'GUM', 'GNB', 'HTI',
'IRQ', 'IMN', 'KEN', 'KIR', 'PRK', 'XKX', 'LBY', 'LIE', 'MDV',
'MHL', 'FSM', 'MCO', 'MNE', 'NAM', 'NCL', 'NIC', 'MNP', 'OMN',
'PLW', 'PNG', 'PRI', 'WSM', 'SMR', 'SRB', 'SYC', 'SLE', 'SGP',
'SLB', 'SOM', 'KNA', 'VCT', 'SUR', 'SYR', 'TON', 'TTO', 'TCA',
'TUV', 'ARE', 'VUT', 'VEN', 'VNM', 'VIR', 'PSE', 'ZMB'],
dtype=object)
df.shape
(1055, 12)
# On supprimes ces pays de notre dataframe
col = df.columns
df = df[~df['Country Code'].isin(liste_noData)][col]
df.shape
(700, 12)
# On crée une nouvelle colonne last_data avec les valeurs de 2015
df['Last_data'] = df['2015']
df.head()
| Country Name | Country Code | Region | Indicator Name | Indicator Code | Income Group | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | Last_data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 92877 | Afghanistan | AFG | South Asia | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | Low income | 1.590000e+03 | 1.680000e+03 | 1.900000e+03 | 1.900000e+03 | 1.890000e+03 | 1.880000e+03 | 1.880000e+03 |
| 92960 | Afghanistan | AFG | South Asia | Gross enrolment ratio, secondary, both sexes (%) | SE.SEC.ENRR | Low income | 5.324683e+01 | 5.461618e+01 | 5.667734e+01 | 5.668866e+01 | 5.565616e+01 | 5.564441e+01 | 5.564441e+01 |
| 92964 | Afghanistan | AFG | South Asia | Gross enrolment ratio, tertiary, both sexes (%) | SE.TER.ENRR | Low income | NaN | 3.755980e+00 | NaN | NaN | 8.662800e+00 | NaN | NaN |
| 93000 | Afghanistan | AFG | South Asia | Internet users (per 100 people) | IT.NET.USER.P2 | Low income | 4.000000e+00 | 5.000000e+00 | 5.454545e+00 | 5.900000e+00 | 7.000000e+00 | 8.260000e+00 | 8.260000e+00 |
| 94107 | Afghanistan | AFG | South Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 6.090239e+06 | 6.330684e+06 | 6.569239e+06 | 6.803785e+06 | 7.032072e+06 | 7.252785e+06 | 7.252785e+06 |
# Réindexing
df = df.reset_index()
df = df.drop('index',axis=1)
df.head()
| Country Name | Country Code | Region | Indicator Name | Indicator Code | Income Group | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | Last_data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | South Asia | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | Low income | 1.590000e+03 | 1.680000e+03 | 1.900000e+03 | 1.900000e+03 | 1.890000e+03 | 1.880000e+03 | 1.880000e+03 |
| 1 | Afghanistan | AFG | South Asia | Gross enrolment ratio, secondary, both sexes (%) | SE.SEC.ENRR | Low income | 5.324683e+01 | 5.461618e+01 | 5.667734e+01 | 5.668866e+01 | 5.565616e+01 | 5.564441e+01 | 5.564441e+01 |
| 2 | Afghanistan | AFG | South Asia | Gross enrolment ratio, tertiary, both sexes (%) | SE.TER.ENRR | Low income | NaN | 3.755980e+00 | NaN | NaN | 8.662800e+00 | NaN | NaN |
| 3 | Afghanistan | AFG | South Asia | Internet users (per 100 people) | IT.NET.USER.P2 | Low income | 4.000000e+00 | 5.000000e+00 | 5.454545e+00 | 5.900000e+00 | 7.000000e+00 | 8.260000e+00 | 8.260000e+00 |
| 4 | Afghanistan | AFG | South Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 6.090239e+06 | 6.330684e+06 | 6.569239e+06 | 6.803785e+06 | 7.032072e+06 | 7.252785e+06 | 7.252785e+06 |
#On parcours toute les lignes et on liste l'origine des données
nb_nan_LData = 0
nb_data_recup_2014 = 0
nb_data_recup_2013 = 0
nb_data_recup_2012 = 0
nb_data_recup_2011 = 0
nb_data_recup_2010 = 0
for i in range(len(df)):
# Tester si nan
if(pd.isnull(df.loc[i,'Last_data'])):
nb_nan_LData +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.notnull(df.loc[i,'2014'])):
nb_data_recup_2014 +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.notnull(df.loc[i,'2013'])):
nb_data_recup_2013 +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.notnull(df.loc[i,'2012'])):
nb_data_recup_2012 +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.isnull(df.loc[i,'2012']))&(pd.notnull(df.loc[i,'2011'])):
nb_data_recup_2011 +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.isnull(df.loc[i,'2012']))&(pd.isnull(df.loc[i,'2011']))&(pd.notnull(df.loc[i,'2010'])):
nb_data_recup_2010 +=1
nb_restant = nb_nan_LData - (nb_data_recup_2014 + nb_data_recup_2013 + nb_data_recup_2012 + nb_data_recup_2011 + nb_data_recup_2010)
print("Last_Data données manquantes : ",nb_nan_LData)
print("----------------------------------------------")
print("Origines des dernières données : ")
print("2014 : ",nb_data_recup_2014)
print("2013 : ",nb_data_recup_2013)
print("2012 : ",nb_data_recup_2012)
print("2011 : ",nb_data_recup_2011)
print("2010 : ",nb_data_recup_2010)
print("restant : ",nb_restant)
Last_Data données manquantes : 102 ---------------------------------------------- Origines des dernières données : 2014 : 55 2013 : 20 2012 : 15 2011 : 8 2010 : 4 restant : 0
# On comble avec les dernières valeurs mais l'analyse sera biasée surtout si on fait de la prospective
for i in range(len(df)):
# Tester si nan
if(pd.isnull(df.loc[i,'Last_data'])):
nb_nan_LData +=1
if(pd.isnull(df.loc[i,'Last_data']))&(pd.notnull(df.loc[i,'2014'])):
df.loc[i,'Last_data'] = df.loc[i,'2014']
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.notnull(df.loc[i,'2013'])):
df.loc[i,'Last_data'] = df.loc[i,'2013']
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.notnull(df.loc[i,'2012'])):
df.loc[i,'Last_data'] = df.loc[i,'2012']
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.isnull(df.loc[i,'2012']))&(pd.notnull(df.loc[i,'2011'])):
df.loc[i,'Last_data'] = df.loc[i,'2011']
if(pd.isnull(df.loc[i,'Last_data']))&(pd.isnull(df.loc[i,'2014']))&(pd.isnull(df.loc[i,'2013']))&(pd.isnull(df.loc[i,'2012']))&(pd.isnull(df.loc[i,'2011']))&(pd.notnull(df.loc[i,'2010'])):
df.loc[i,'Last_data'] = df.loc[i,'2010']
# La colonne Last_data est complete
df.Last_data.isnull().mean()*100
0.0
print(f"{indicateurs_retenus}")
['IT.NET.USER.P2', 'NY.GNP.PCAP.PP.CD', 'SP.POP.1524.TO.UN', 'SE.SEC.ENRR', 'SE.TER.ENRR']
#On crée 1 df par indicateur
df_IT = df[df['Indicator Code'] == 'IT.NET.USER.P2']
df_ECO = df[df['Indicator Code'] == 'NY.GNP.PCAP.PP.CD']
df_POP = df[df['Indicator Code'] == 'SP.POP.1524.TO.UN']
df_SEC = df[df['Indicator Code'] == 'SE.SEC.ENRR']
df_TER = df[df['Indicator Code'] == 'SE.TER.ENRR']
col = ['Country Name','Country Code','Region','Indicator Name','Indicator Code','Income Group','Last_data']
# On ne garde en date que last_data
df_IT = df_IT.loc[:,col]
df_ECO = df_ECO.loc[:,col]
df_POP = df_POP.loc[:,col]
df_SEC = df_SEC.loc[:,col]
df_TER = df_TER.loc[:,col]
# Representation graphique des outliers:
fig = plt.figure(figsize=(20,20))
plt.subplot(5,2,1) # maillage des subplot
# Internet users (per 100 people)
sns.boxplot(data = df_IT, x='Indicator Code', y='Last_data',width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Utilisateurs internet',fontweight='bold',size=15)
plt.ylabel('Nombre utilisateur internet en %')
plt.subplot(5,2,2)
sns.histplot(df_IT['Last_data'], kde = True, color='SteelBlue')
plt.title('Internet users (per 100 people)',fontweight='bold',size=15)
plt.xlabel('IT.NET.USER.P2')
plt.grid(False)
plt.subplot(5,2,3) # maillage des subplot
# GNI per capita, PPP (current international $)
sns.boxplot(data = df_ECO, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Economie',fontweight='bold',size=15)
plt.ylabel('GNI per capita, PPP (current international $)')
plt.subplot(5,2,4)
sns.histplot(df_ECO['Last_data'], kde = True, color='SteelBlue')
plt.title('GNI per capita, PPP (current international $)',fontweight='bold',size=15)
plt.xlabel('NY.GNP.PCAP.PP.CD')
plt.grid(False)
plt.subplot(5,2,5) # maillage des subplot
# Population, ages 15-24, total
sns.boxplot(data = df_POP, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"7"})
plt.title('Population 15-24 ans',fontweight='bold',size=15)
plt.ylabel('Nombre de 15-24 ans')
plt.subplot(5,2,6)
sns.histplot(df_POP['Last_data'], kde = True, color='SteelBlue')
plt.title('Population, ages 15-24, total',fontweight='bold',size=15)
plt.xlabel('SP.POP.1524.TO.UN')
plt.grid(False)
plt.subplot(5,2,7) # maillage des subplot
# Enrolment in secondary education, both sexes (number)
sns.boxplot(data = df_SEC, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Secondaire',fontweight='bold',size=15)
plt.ylabel('Nombre population dans le secondaire')
plt.subplot(5,2,8)
sns.histplot(df_SEC['Last_data'], kde = True, color='SteelBlue')
plt.title('Gross enrolment ratio, seconddary, both sexes (%)',fontweight='bold',size=15)
plt.xlabel('SE.SEC.ENRR')
plt.grid(False)
plt.subplot(5,2,9) # maillage des subplot
# Enrolment in tertiary education, all programmes, both sexes (number)
sns.boxplot(data = df_TER, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Post Bac',fontweight='bold',size=15)
plt.ylabel('Nombre population en Post-Bac')
plt.subplot(5,2,10)
sns.histplot(df_TER['Last_data'], kde = True, color='SteelBlue')
plt.title('Gross enrolment ratio, tertiary, both sexes (%)',fontweight='bold',size=15)
plt.xlabel('SE.TER.ENRR')
plt.grid(False)
plt.subplots_adjust(left=0.125, # gerer les espacements
bottom=0.1,
right=0.9,
top=0.9,
wspace=0.4,
hspace=0.4)
plt.savefig("assets/graphiques/PREPROCESSING_2_Mesures distributions-1.jpg", bbox_inches='tight')
plt.show()
# Mesures de tendcances centrales
# Moyennes
mean_IT =df_IT['Last_data'].mean()
mean_ECO=df_ECO['Last_data'].mean()
mean_SEC=df_SEC['Last_data'].mean()
mean_TER=df_TER['Last_data'].mean()
mean_POP=df_POP['Last_data'].mean()
# Médianes
median_IT =df_IT['Last_data'].median()
median_ECO=df_ECO['Last_data'].median()
median_SEC=df_SEC['Last_data'].median()
median_TER=df_TER['Last_data'].median()
median_POP=df_POP['Last_data'].median()
# Mode
# mode_IT =df_IT['Last_data'].mode()
# mode_ECO=df_ECO['Last_data'].mode()
# mode_SEC=df_SEC['Last_data'].mode()
# mode_TER=df_TER['Last_data'].mode()
# mode_POP=df_POP['Last_data'].mode()
# Variance
var_IT =df_IT['Last_data'].var(ddof=0)
var_ECO=df_ECO['Last_data'].var(ddof=0)
var_SEC=df_SEC['Last_data'].var(ddof=0)
var_TER=df_TER['Last_data'].var(ddof=0)
var_POP=df_POP['Last_data'].var(ddof=0)
# Ecart type
std_IT =df_IT['Last_data'].std(ddof=0)
std_ECO=df_ECO['Last_data'].std(ddof=0)
std_SEC=df_SEC['Last_data'].std(ddof=0)
std_TER=df_TER['Last_data'].std(ddof=0)
std_POP=df_POP['Last_data'].std(ddof=0)
# Les données suivent une loi normal si : skew=0 et Kurtosis =3
# skew : coefficient d'assymétrie (0 => symétrique)
skew_IT =df_IT['Last_data'].skew()
skew_ECO=df_ECO['Last_data'].skew()
skew_SEC=df_SEC['Last_data'].skew()
skew_TER=df_TER['Last_data'].skew()
skew_POP=df_POP['Last_data'].skew()
# Kortosis : coefficient d'applatissement proche de 3 suit loi normale
kurtosis_IT =df_IT['Last_data'].kurtosis()
kurtosis_ECO=df_ECO['Last_data'].kurtosis()
kurtosis_SEC=df_SEC['Last_data'].kurtosis()
kurtosis_TER=df_TER['Last_data'].kurtosis()
kurtosis_POP=df_POP['Last_data'].kurtosis()
# Visualisation des indicateurs
# On crée une liste des tendances centrales
stats = [['mean',mean_IT, mean_ECO, mean_SEC, mean_TER, mean_POP],
['median', median_IT, median_ECO, median_SEC,median_TER,median_POP],
['var',var_IT, var_ECO, var_SEC,var_TER,var_POP],
['std',std_IT, std_ECO,std_SEC, std_TER, std_POP],
['skew', skew_IT, skew_ECO, skew_SEC, skew_TER, skew_POP],
['kurtosis', kurtosis_IT, kurtosis_ECO, kurtosis_SEC, kurtosis_TER, kurtosis_POP]]
# On crée un dataframe et des colonnes
df_stats = pd.DataFrame(stats,columns=['Statistique','IT_last_data','ECO_last_data','SEC_last_data','TER_last_data','POP_last_data'])
df_stats
| Statistique | IT_last_data | ECO_last_data | SEC_last_data | TER_last_data | POP_last_data | |
|---|---|---|---|---|---|---|
| 0 | mean | 48.462239 | 1.998679e+04 | 83.988968 | 40.205176 | 8.038740e+06 |
| 1 | median | 50.219659 | 1.201000e+04 | 91.317413 | 36.794924 | 1.703510e+06 |
| 2 | var | 815.902838 | 4.648906e+08 | 893.296681 | 831.019710 | 7.317407e+14 |
| 3 | std | 28.564013 | 2.156132e+04 | 29.888069 | 28.827412 | 2.705071e+07 |
| 4 | skew | -0.009762 | 1.885302e+00 | -0.250500 | 0.367150 | 7.317494e+00 |
| 5 | kurtosis | -1.305362 | 4.559860e+00 | -0.340264 | -1.114788 | 5.761620e+01 |
# Création df avec tous les indicateurs en colonne
df_indicateurs_retenus = df.pivot_table(index=['Country Name'],columns='Indicator Code')['Last_data']
df_indicateurs_retenus.head()
| Indicator Code | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN |
|---|---|---|---|---|---|
| Country Name | |||||
| Afghanistan | 8.260000 | 1880.0 | 55.644409 | 8.662800 | 7252785.0 |
| Albania | 63.252933 | 11340.0 | 95.765488 | 58.109951 | 556269.0 |
| Algeria | 38.200000 | 14170.0 | 99.860191 | 36.922279 | 6467818.0 |
| Angola | 12.400000 | 6270.0 | 28.898720 | 9.308020 | 4259352.0 |
| Argentina | 68.043064 | 19990.0 | 106.777901 | 82.917389 | 6886530.0 |
df_indicateurs_retenus.describe()
| Indicator Code | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN |
|---|---|---|---|---|---|
| count | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 1.400000e+02 |
| mean | 48.462239 | 19986.785714 | 83.988968 | 40.205176 | 8.038740e+06 |
| std | 28.666577 | 21638.740547 | 29.995388 | 28.930922 | 2.714784e+07 |
| min | 1.083733 | 670.000000 | 17.378010 | 0.797730 | 2.353600e+04 |
| 25% | 21.845365 | 3977.500000 | 60.991915 | 12.270108 | 5.681078e+05 |
| 50% | 50.219659 | 12010.000000 | 91.317413 | 36.794924 | 1.703510e+06 |
| 75% | 72.949675 | 28492.500000 | 102.481300 | 64.801764 | 5.751938e+06 |
| max | 98.200000 | 125000.000000 | 166.808472 | 113.871788 | 2.441202e+08 |
# Identifier les outlier ECONOMIE
df_ECO.loc[df_ECO['Last_data']>60000]
| Country Name | Country Code | Region | Indicator Name | Indicator Code | Income Group | Last_data | |
|---|---|---|---|---|---|---|---|
| 85 | Brunei Darussalam | BRN | East Asia & Pacific | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: nonOECD | 83600.0 |
| 350 | Kuwait | KWT | Middle East & North Africa | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: nonOECD | 81150.0 |
| 390 | Luxembourg | LUX | Europe & Central Asia | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: OECD | 68780.0 |
| 395 | Macao SAR, China | MAC | East Asia & Pacific | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: nonOECD | 98650.0 |
| 495 | Norway | NOR | Europe & Central Asia | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: OECD | 65180.0 |
| 535 | Qatar | QAT | Middle East & North Africa | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: nonOECD | 125000.0 |
| 615 | Switzerland | CHE | Europe & Central Asia | GNI per capita, PPP (current international $) | NY.GNP.PCAP.PP.CD | High income: OECD | 63930.0 |
# Identifier les outliers POP
df_POP.loc[df_POP['Last_data']>mean_POP].sort_values(ascending=False,by='Last_data')
| Country Name | Country Code | Region | Indicator Name | Indicator Code | Income Group | Last_data | |
|---|---|---|---|---|---|---|---|
| 299 | India | IND | South Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 244120201.0 |
| 134 | China | CHN | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 197026759.0 |
| 679 | United States | USA | North America | Population, ages 15-24, total | SP.POP.1524.TO.UN | High income: OECD | 45147517.0 |
| 304 | Indonesia | IDN | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 41819264.0 |
| 504 | Pakistan | PAK | South Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 39080811.0 |
| 494 | Nigeria | NGA | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 36659023.0 |
| 49 | Bangladesh | BGD | South Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 34400594.0 |
| 84 | Brazil | BRA | Latin America & Caribbean | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 33595574.0 |
| 219 | Ethiopia | ETH | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 21074230.0 |
| 444 | Mexico | MEX | Latin America & Caribbean | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 20336734.0 |
| 524 | Philippines | PHL | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 19278458.0 |
| 149 | Congo, Dem. Rep. | COD | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 15535095.0 |
| 199 | Egypt, Arab Rep. | EGY | Middle East & North Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 15377746.0 |
| 549 | Russian Federation | RUS | Europe & Central Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | High income: nonOECD | 14661984.0 |
| 654 | Turkey | TUR | Europe & Central Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 13930222.0 |
| 309 | Iran, Islamic Rep. | IRN | Middle East & North Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 13556577.0 |
| 334 | Japan | JPN | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | High income: OECD | 12157426.0 |
| 584 | South Africa | ZAF | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 9998431.0 |
| 629 | Tanzania | TZA | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 9849521.0 |
| 604 | Sudan | SDN | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Lower middle income | 9381489.0 |
| 634 | Thailand | THA | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 9149718.0 |
| 139 | Colombia | COL | Latin America & Caribbean | Population, ages 15-24, total | SP.POP.1524.TO.UN | Upper middle income | 8999338.0 |
| 469 | Myanmar | MMR | East Asia & Pacific | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 8786817.0 |
| 244 | Germany | DEU | Europe & Central Asia | Population, ages 15-24, total | SP.POP.1524.TO.UN | High income: OECD | 8682394.0 |
| 664 | Uganda | UGA | Sub-Saharan Africa | Population, ages 15-24, total | SP.POP.1524.TO.UN | Low income | 8244246.0 |
# Identifier les outlier SEC
df_SEC.loc[df_SEC['Last_data']>150]
| Country Name | Country Code | Region | Indicator Name | Indicator Code | Income Group | Last_data | |
|---|---|---|---|---|---|---|---|
| 61 | Belgium | BEL | Europe & Central Asia | Gross enrolment ratio, secondary, both sexes (%) | SE.SEC.ENRR | High income: OECD | 166.808472 |
df_SEC_100 = df[df['Indicator Code'] == 'SE.SEC.ENRR']
df_TER_100 = df[df['Indicator Code'] == 'SE.TER.ENRR']
# On mets toutes les valeurs >100% a 100
df_SEC_100.Last_data[df_SEC_100.Last_data>100] = 100
df_TER_100.Last_data[df_TER_100.Last_data>100] = 100
# Representation graphique
fig = plt.figure(figsize=(20,20))
plt.subplot(5,2,1) # maillage des subplot
# Internet users (per 100 people)
sns.boxplot(data = df_IT, x='Indicator Code', y='Last_data',width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Utilisateurs internet',fontweight='bold',size=15)
plt.ylabel('Nombre utilisateur internet en %')
plt.subplot(5,2,2)
sns.histplot(df_IT['Last_data'], kde = True, color='SteelBlue')
plt.title('Internet users (per 100 people)',fontweight='bold',size=15)
plt.xlabel('IT.NET.USER.P2')
plt.grid(False)
plt.subplot(5,2,3) # maillage des subplot
# GNI per capita, PPP (current international $)
sns.boxplot(data = df_ECO, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Economie',fontweight='bold',size=15)
plt.ylabel('GNI per capita, PPP (current international $)')
plt.subplot(5,2,4)
sns.histplot(df_ECO['Last_data'], kde = True, color='SteelBlue')
plt.title('GNI per capita, PPP (current international $)',fontweight='bold',size=15)
plt.xlabel('NY.GNP.PCAP.PP.CD')
plt.grid(False)
plt.subplot(5,2,5) # maillage des subplot
# Population, ages 15-24, total
sns.boxplot(data = df_POP, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"7"})
plt.title('Population 15-24 ans',fontweight='bold',size=15)
plt.ylabel('Nombre de 15-24 ans')
plt.subplot(5,2,6)
sns.histplot(df_POP['Last_data'], kde = True, color='SteelBlue')
plt.title('Population, ages 15-24, total',fontweight='bold',size=15)
plt.xlabel('SP.POP.1524.TO.UN')
plt.grid(False)
plt.subplot(5,2,7) # maillage des subplot
# Enrolment in secondary education, both sexes (number)
sns.boxplot(data = df_SEC_100, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Secondaire',fontweight='bold',size=15)
plt.ylabel('Nombre population dans le secondaire')
plt.subplot(5,2,8)
sns.histplot(df_SEC_100['Last_data'], kde = True, color='SteelBlue')
plt.title('Gross enrolment ratio, seconddary, both sexes (%)',fontweight='bold',size=15)
plt.xlabel('SE.SEC.ENRR')
plt.grid(False)
plt.subplot(5,2,9) # maillage des subplot
# Enrolment in tertiary education, all programmes, both sexes (number)
sns.boxplot(data = df_TER_100, x='Indicator Code', y='Last_data', width=0.5, color='SteelBlue',showmeans=True,meanprops={"marker":"o",
"markerfacecolor":"red",
"markeredgecolor":"black",
"markersize":"5"})
plt.title('Post Bac',fontweight='bold',size=15)
plt.ylabel('Nombre population en Post-Bac')
plt.subplot(5,2,10)
sns.histplot(df_TER_100['Last_data'], kde = True, color='SteelBlue')
plt.title('Gross enrolment ratio, tertiary, both sexes (%)',fontweight='bold',size=15)
plt.xlabel('SE.TER.ENRR')
plt.grid(False)
plt.subplots_adjust(left=0.125, # gerer les espacements
bottom=0.1,
right=0.9,
top=0.9,
wspace=0.4,
hspace=0.4)
plt.savefig("assets/graphiques/PREPROCESSING_3_Mesures distributions-2.jpg", bbox_inches='tight')
plt.show()
# Mesures de tendcances centrales
# Moyennes
mean_IT =df_IT['Last_data'].mean()
mean_ECO=df_ECO['Last_data'].mean()
mean_SEC_100=df_SEC_100['Last_data'].mean()
mean_TER_100=df_TER_100['Last_data'].mean()
mean_POP=df_POP['Last_data'].mean()
# Médianes
median_IT =df_IT['Last_data'].median()
median_ECO=df_ECO['Last_data'].median()
median_SEC_100=df_SEC_100['Last_data'].median()
median_TER_100=df_TER_100['Last_data'].median()
median_POP=df_POP['Last_data'].median()
# Mode
# mode_IT =df_IT['Last_data'].mode()
# mode_ECO=df_ECO['Last_data'].mode()
# mode_SEC=df_SEC['Last_data'].mode()
# mode_TER=df_TER['Last_data'].mode()
# mode_POP=df_POP['Last_data'].mode()
# Variance
var_IT =df_IT['Last_data'].var(ddof=0)
var_ECO=df_ECO['Last_data'].var(ddof=0)
var_SEC_100=df_SEC_100['Last_data'].var(ddof=0)
var_TER_100=df_TER_100['Last_data'].var(ddof=0)
var_POP=df_POP['Last_data'].var(ddof=0)
# Ecart type
std_IT =df_IT['Last_data'].std(ddof=0)
std_ECO=df_ECO['Last_data'].std(ddof=0)
std_SEC_100=df_SEC_100['Last_data'].std(ddof=0)
std_TER_100=df_TER_100['Last_data'].std(ddof=0)
std_POP=df_POP['Last_data'].std(ddof=0)
# Les données suivent une loi normal si : skew=0 et Kurtosis =3
# skew : coefficient d'assymétrie (0 => symétrique)
skew_IT =df_IT['Last_data'].skew()
skew_ECO=df_ECO['Last_data'].skew()
skew_SEC_100=df_SEC_100['Last_data'].skew()
skew_TER_100=df_TER_100['Last_data'].skew()
skew_POP=df_POP['Last_data'].skew()
# Kortosis : coefficient d'applatissement proche de 3 suit loi normale
kurtosis_IT =df_IT['Last_data'].kurtosis()
kurtosis_ECO=df_ECO['Last_data'].kurtosis()
kurtosis_SEC_100=df_SEC_100['Last_data'].kurtosis()
kurtosis_TER_100=df_TER_100['Last_data'].kurtosis()
kurtosis_POP=df_POP['Last_data'].kurtosis()
# Visualisation des indicateurs
# On crée une liste des tendances centrales
stats = [['mean',mean_IT, mean_ECO, mean_SEC_100, mean_TER_100, mean_POP],
['median', median_IT, median_ECO, median_SEC_100,median_TER_100,median_POP],
['var',var_IT, var_ECO, var_SEC_100,var_TER_100,var_POP],
['std',std_IT, std_ECO,std_SEC_100, std_TER_100, std_POP],
['skew', skew_IT, skew_ECO, skew_SEC_100, skew_TER_100, skew_POP],
['kurtosis', kurtosis_IT, kurtosis_ECO, kurtosis_SEC_100, kurtosis_TER_100, kurtosis_POP]]
# On crée un dataframe et des colonnes
df_stats_100 = pd.DataFrame(stats,columns=['Statistique','IT_last_data','ECO_last_data','SEC_last_data','TER_last_data','POP_last_data'])
df_stats_100
| Statistique | IT_last_data | ECO_last_data | SEC_last_data | TER_last_data | POP_last_data | |
|---|---|---|---|---|---|---|
| 0 | mean | 48.462239 | 1.998679e+04 | 79.485614 | 40.106092 | 8.038740e+06 |
| 1 | median | 50.219659 | 1.201000e+04 | 91.317413 | 36.794924 | 1.703510e+06 |
| 2 | var | 815.902838 | 4.648906e+08 | 595.571429 | 817.785972 | 7.317407e+14 |
| 3 | std | 28.564013 | 2.156132e+04 | 24.404332 | 28.596957 | 2.705071e+07 |
| 4 | skew | -0.009762 | 1.885302e+00 | -0.955947 | 0.329185 | 7.317494e+00 |
| 5 | kurtosis | -1.305362 | 4.559860e+00 | -0.461762 | -1.231056 | 5.761620e+01 |
# On remet les indicateurs à 100%
df_indicateurs_retenus.loc[df_indicateurs_retenus['SE.SEC.ENRR']>100] = 100
df_indicateurs_retenus.loc[df_indicateurs_retenus['SE.TER.ENRR']>100] = 100
df_indicateurs_retenus.describe()
| Indicator Code | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN |
|---|---|---|---|---|---|
| count | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 1.400000e+02 |
| mean | 55.463510 | 8888.214286 | 79.485614 | 49.598897 | 7.102725e+06 |
| std | 35.729243 | 17730.777139 | 24.491960 | 38.694656 | 2.727416e+07 |
| min | 1.083733 | 100.000000 | 17.378010 | 0.797730 | 1.000000e+02 |
| 25% | 21.845365 | 100.000000 | 60.991915 | 12.270108 | 1.000000e+02 |
| 50% | 52.809558 | 3170.000000 | 91.317413 | 37.703526 | 5.968975e+05 |
| 75% | 100.000000 | 10617.500000 | 100.000000 | 100.000000 | 4.303251e+06 |
| max | 100.000000 | 125000.000000 | 100.000000 | 100.000000 | 2.441202e+08 |
df_indicateurs_cleaned = df_indicateurs_retenus.copy()
df_indicateurs_cleaned.describe()
| Indicator Code | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN |
|---|---|---|---|---|---|
| count | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 1.400000e+02 |
| mean | 55.463510 | 8888.214286 | 79.485614 | 49.598897 | 7.102725e+06 |
| std | 35.729243 | 17730.777139 | 24.491960 | 38.694656 | 2.727416e+07 |
| min | 1.083733 | 100.000000 | 17.378010 | 0.797730 | 1.000000e+02 |
| 25% | 21.845365 | 100.000000 | 60.991915 | 12.270108 | 1.000000e+02 |
| 50% | 52.809558 | 3170.000000 | 91.317413 | 37.703526 | 5.968975e+05 |
| 75% | 100.000000 | 10617.500000 | 100.000000 | 100.000000 | 4.303251e+06 |
| max | 100.000000 | 125000.000000 | 100.000000 | 100.000000 | 2.441202e+08 |
Pas de valeurs aberrantes mais de fortes disparités entre les pays en raison de leurs tailles et de leurs revenus
# Fonction de visualisation des indicateurs normalisés pour fair la selection de la transormation la plus adéquat
def graph_norm(df,titre):
titre = "assets/graphiques/"+titre+".jpeg"
plt.figure(figsize=(10,5))
col=indicateurs_retenus
colors=['orange','Green','sienna','SteelBlue','Purple']
for i in range(5):
sns.kdeplot(df[col[i]],label=df.columns[i],color=colors[i])
plt.legend()
plt.grid(False)
plt.xlabel('')
# plt.savefig(titre)
plt.show()
# On normalise avec scit-learb
# from sklearn import preprocessing
cols = ['IT.NET.USER.P2','NY.GNP.PCAP.PP.CD','SE.SEC.ENRR','SE.TER.ENRR','SP.POP.1524.TO.UN']
StandardScaler sur les données cleaner SEC et TER a 100%
scaler = nltk.preprocessing.StandardScaler()
scaled = scaler.fit_transform(df_indicateurs_cleaned[cols])
df_StandardScaler = pd.DataFrame(scaled, columns=cols)
# df_StandardScaler.head()
graph_norm(df_StandardScaler,titre="NORMALISATION_1_StandardScaler_100")
df_norm = df_indicateurs_cleaned.copy()
# 'IT.NET.USER.P2', 'NY.GNP.PCAP.PP.CD', 'SP.POP.1524.TO.UN', 'SE.SEC.ENRR', 'SE.TER.ENRR'
indicateur = "NY.GNP.PCAP.PP.CD"
print(f"max : {df_norm.loc[:,[indicateur]].max()}")
print(f"min : {df_norm.loc[:,[indicateur]].min()}")
print(f"median : {df_norm.loc[:,[indicateur]].median()}")
max : Indicator Code NY.GNP.PCAP.PP.CD 125000.0 dtype: float64 min : Indicator Code NY.GNP.PCAP.PP.CD 100.0 dtype: float64 median : Indicator Code NY.GNP.PCAP.PP.CD 3170.0 dtype: float64
# 'IT.NET.USER.P2', 'NY.GNP.PCAP.PP.CD', 'SP.POP.1524.TO.UN', 'SE.SEC.ENRR', 'SE.TER.ENRR'
indicateur = "SP.POP.1524.TO.UN"
print(f"max : {df_norm.loc[:,[indicateur]].max()}")
print(f"min : {df_norm.loc[:,[indicateur]].min()}")
print(f"median : {df_norm.loc[:,[indicateur]].median()}")
max : Indicator Code SP.POP.1524.TO.UN 244120201.0 dtype: float64 min : Indicator Code SP.POP.1524.TO.UN 100.0 dtype: float64 median : Indicator Code SP.POP.1524.TO.UN 596897.5 dtype: float64
df_norm = df_norm.loc[df_norm['NY.GNP.PCAP.PP.CD']>5430.0]
df_norm = df_norm.loc[df_norm['SP.POP.1524.TO.UN']>596897.5]
df_norm.head()
| Indicator Code | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN |
|---|---|---|---|---|---|
| Country Name | |||||
| Algeria | 38.200000 | 14170.0 | 99.860191 | 36.922279 | 6467818.0 |
| Angola | 12.400000 | 6270.0 | 28.898720 | 9.308020 | 4259352.0 |
| Brazil | 58.327952 | 15310.0 | 99.650970 | 50.604919 | 33595574.0 |
| Bulgaria | 56.656300 | 17880.0 | 99.016350 | 73.934196 | 710294.0 |
| China | 50.300000 | 14420.0 | 94.299210 | 43.391769 | 197026759.0 |
scaler = nltk.preprocessing.StandardScaler()
scaled = scaler.fit_transform(df_norm[cols])
df_StandardScaler_norm = pd.DataFrame(scaled, columns=cols)
# df_StandardScaler_n.head()
graph_norm(df_StandardScaler_norm,titre="NORMALISATION_2_StandardScaler_norm")
scaler = nltk.preprocessing.PowerTransformer()
scaled = scaler.fit_transform(df_norm[cols])
df_PowerTransformer_norm = pd.DataFrame(scaled, columns=cols)
# df_StandardScaler_n.head()
graph_norm(df_PowerTransformer_norm,titre="NORMALISATION_3_PowerTransform_norm")
Mise en correspondance avec une distribution gaussienne
Dans de nombreux scénarios de modélisation, la normalité des caractéristiques d'un ensemble de données est souhaitable. Les transformées de puissance sont une famille de transformations paramétriques et monotones qui visent à faire correspondre les données de n'importe quelle distribution à une distribution aussi proche que possible de la distribution gaussienne afin de stabiliser la variance et de minimiser l'asymétrie.
df_indicateurs_nonCleaner = df.pivot_table(index=['Country Name'],columns='Indicator Code')['Last_data']
scaler = nltk.preprocessing.PowerTransformer(method = 'box-cox')
df_PowerTransformer = scaler.fit_transform(df_indicateurs_nonCleaner[cols])
df_PowerTransformer = pd.DataFrame(df_PowerTransformer, columns=cols)
df_PowerTransformer.head()
graph_norm(df_PowerTransformer,titre="NORMALISATION_3_PowerTransform_non_clean_OK")
Les statistiques descriptives des indicateurs ont montré la présence de courbes asymétriques (étalées à droite ou à gauche), les courbes de l'indicateur démographique et économique ont des observations plus concentrées et une variance élevée, avec des outliers.
On garde la normalisation de PowerTransformer pour le scoring
df_stats = df.copy()
# On supprime les anées 2010 2011 2012 2013 2014 2015
df_stats = df_stats.drop(['2010','2011','2012','2013','2014','2015'],axis=1)
#Oncrée les colone de base
a2 = pd.DataFrame({'Country Name' : df_stats['Country Name'],
'Country Code' : df_stats['Country Code'],
'Region' : df_stats['Region'],
'Income Group' : df_stats['Income Group']})
# On recupère des dernier données par indicateur
a3 = df_stats.pivot_table(index=['Country Code'],columns='Indicator Code')['Last_data']
a3.loc[a3['SE.SEC.ENRR']>100] = 100
a3.loc[a3['SE.TER.ENRR']>100] = 100
# On fusionne les deux pour avoir toutes les informations
df_stats = pd.merge(a2,a3,left_on="Country Code", right_on='Country Code', how='inner')
df_stats = df_stats.drop_duplicates()
df_stats.head()
| Country Name | Country Code | Region | Income Group | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | South Asia | Low income | 8.260000 | 1880.0 | 55.644409 | 8.662800 | 7252785.0 |
| 5 | Albania | ALB | Europe & Central Asia | Upper middle income | 63.252933 | 11340.0 | 95.765488 | 58.109951 | 556269.0 |
| 10 | Algeria | DZA | Middle East & North Africa | Upper middle income | 38.200000 | 14170.0 | 99.860191 | 36.922279 | 6467818.0 |
| 15 | Angola | AGO | Sub-Saharan Africa | Upper middle income | 12.400000 | 6270.0 | 28.898720 | 9.308020 | 4259352.0 |
| 20 | Argentina | ARG | Latin America & Caribbean | Upper middle income | 100.000000 | 100.0 | 100.000000 | 100.000000 | 100.0 |
# On affiche les ordres de grandeur des indicateurs
df_stats.describe()
| IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN | |
|---|---|---|---|---|---|
| count | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 1.400000e+02 |
| mean | 55.463510 | 8888.214286 | 79.485614 | 49.598897 | 7.102725e+06 |
| std | 35.729243 | 17730.777139 | 24.491960 | 38.694656 | 2.727416e+07 |
| min | 1.083733 | 100.000000 | 17.378010 | 0.797730 | 1.000000e+02 |
| 25% | 21.845365 | 100.000000 | 60.991915 | 12.270108 | 1.000000e+02 |
| 50% | 52.809558 | 3170.000000 | 91.317413 | 37.703526 | 5.968975e+05 |
| 75% | 100.000000 | 10617.500000 | 100.000000 | 100.000000 | 4.303251e+06 |
| max | 100.000000 | 125000.000000 | 100.000000 | 100.000000 | 2.441202e+08 |
df_stats_region = df_stats.copy()
df_stats_region.head()
| Country Name | Country Code | Region | Income Group | IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | South Asia | Low income | 8.260000 | 1880.0 | 55.644409 | 8.662800 | 7252785.0 |
| 5 | Albania | ALB | Europe & Central Asia | Upper middle income | 63.252933 | 11340.0 | 95.765488 | 58.109951 | 556269.0 |
| 10 | Algeria | DZA | Middle East & North Africa | Upper middle income | 38.200000 | 14170.0 | 99.860191 | 36.922279 | 6467818.0 |
| 15 | Angola | AGO | Sub-Saharan Africa | Upper middle income | 12.400000 | 6270.0 | 28.898720 | 9.308020 | 4259352.0 |
| 20 | Argentina | ARG | Latin America & Caribbean | Upper middle income | 100.000000 | 100.0 | 100.000000 | 100.000000 | 100.0 |
for i in range(len(indicateurs_retenus)):
print('--------------------------------------------------------')
print(indicateurs_retenus[i])
print('--------------------------------------------------------')
print(df_stats_region.groupby('Region')[indicateurs_retenus[i]].describe(include=all))
--------------------------------------------------------
IT.NET.USER.P2
--------------------------------------------------------
count mean std min 25% \
Region
East Asia & Pacific 16.0 63.740673 33.526813 18.200000 22.744017
Europe & Central Asia 44.0 85.080086 24.608391 14.996775 70.235925
Latin America & Caribbean 21.0 58.495446 25.891286 26.802605 41.590000
Middle East & North Africa 14.0 66.553292 25.806430 24.085409 46.131191
North America 1.0 74.554202 NaN 74.554202 74.554202
South Asia 7.0 21.432908 10.988467 8.260000 14.200000
Sub-Saharan Africa 37.0 16.669778 12.904877 1.083733 7.459161
50% 75% max
Region
East Asia & Pacific 71.132034 100.000000 100.000000
Europe & Central Asia 100.000000 100.000000 100.000000
Latin America & Caribbean 53.810000 64.600000 100.000000
Middle East & North Africa 67.057191 89.044437 100.000000
North America 74.554202 74.554202 74.554202
South Asia 17.581618 27.994369 39.800000
Sub-Saharan Africa 12.400000 22.742818 51.919116
--------------------------------------------------------
NY.GNP.PCAP.PP.CD
--------------------------------------------------------
count mean std min \
Region
East Asia & Pacific 16.0 18923.750000 29947.449279 100.0
Europe & Central Asia 44.0 4573.181818 8358.485668 100.0
Latin America & Caribbean 21.0 9083.809524 6591.388682 100.0
Middle East & North Africa 14.0 23414.285714 35963.664508 100.0
North America 1.0 57900.000000 NaN 57900.0
South Asia 7.0 5481.428571 3323.750034 1880.0
Sub-Saharan Africa 37.0 3392.432432 3750.090670 670.0
25% 50% 75% max
Region
East Asia & Pacific 100.0 7205.0 17300.0 98650.0
Europe & Central Asia 100.0 100.0 5617.5 31430.0
Latin America & Caribbean 4290.0 8370.0 13590.0 20570.0
Middle East & North Africa 3942.5 10855.0 16780.0 125000.0
North America 57900.0 57900.0 57900.0 57900.0
South Asia 3035.0 5320.0 6800.0 11500.0
Sub-Saharan Africa 1500.0 1990.0 3700.0 19980.0
--------------------------------------------------------
SP.POP.1524.TO.UN
--------------------------------------------------------
count mean std min \
Region
East Asia & Pacific 16.0 1.757077e+07 4.909192e+07 100.0
Europe & Central Asia 44.0 4.676263e+05 1.188618e+06 100.0
Latin America & Caribbean 21.0 3.826213e+06 8.278347e+06 100.0
Middle East & North Africa 14.0 3.716807e+06 5.170341e+06 100.0
North America 1.0 4.514752e+07 NaN 45147517.0
South Asia 7.0 4.782893e+07 8.793947e+07 147444.0
Sub-Saharan Africa 37.0 4.873992e+06 6.976425e+06 38279.0
25% 50% 75% max
Region
East Asia & Pacific 100.0 398884.5 7039125.00 197026759.0
Europe & Central Asia 100.0 100.0 461100.75 5897703.0
Latin America & Caribbean 23536.0 536826.0 1962188.00 33595574.0
Middle East & North Africa 70972.5 1053006.5 6063574.75 15377746.0
North America 45147517.0 45147517.0 45147517.00 45147517.0
South Asia 4900342.0 7252785.0 36740702.50 244120201.0
Sub-Saharan Africa 978071.0 3009775.0 4945440.00 36659023.0
--------------------------------------------------------
SE.SEC.ENRR
--------------------------------------------------------
count mean std min 25% \
Region
East Asia & Pacific 16.0 88.672663 14.829773 51.296089 83.774380
Europe & Central Asia 44.0 97.535537 4.987859 79.294098 98.816381
Latin America & Caribbean 21.0 88.640386 11.284546 65.626701 79.425407
Middle East & North Africa 14.0 86.115675 15.910126 48.624660 83.359735
North America 1.0 97.564690 NaN 97.564690 97.564690
South Asia 7.0 69.822295 18.283007 44.525871 59.584469
Sub-Saharan Africa 37.0 47.683012 20.210481 17.378010 33.666660
50% 75% max
Region
East Asia & Pacific 95.189339 100.000000 100.000000
Europe & Central Asia 100.000000 100.000000 100.000000
Latin America & Caribbean 90.549690 99.650970 100.000000
Middle East & North Africa 90.169865 98.638306 100.000000
North America 97.564690 97.564690 97.564690
South Asia 67.169853 79.083447 99.724510
Sub-Saharan Africa 43.395191 56.811958 98.819672
--------------------------------------------------------
SE.TER.ENRR
--------------------------------------------------------
count mean std min 25% \
Region
East Asia & Pacific 16.0 59.140481 35.885345 13.528170 25.619302
Europe & Central Asia 44.0 83.347298 27.466859 7.984440 66.814675
Latin America & Caribbean 21.0 52.388614 32.196095 12.480620 27.220270
Middle East & North Africa 14.0 49.291286 31.134223 9.974600 29.759329
North America 1.0 85.795776 NaN 85.795776 85.795776
South Asia 7.0 14.938481 6.440884 8.662800 10.427160
Sub-Saharan Africa 37.0 9.451695 6.853559 0.797730 4.775910
50% 75% max
Region
East Asia & Pacific 55.979393 100.000000 100.000000
Europe & Central Asia 100.000000 100.000000 100.000000
Latin America & Caribbean 40.512791 91.147743 100.000000
Middle East & North Africa 37.703526 65.766324 100.000000
North America 85.795776 85.795776 85.795776
South Asia 13.440800 17.368285 26.874880
Sub-Saharan Africa 8.433270 10.849330 36.667568
fig = plt.figure(figsize=(20,20))
fig.suptitle(' Ordres de grandeurs par indicateur et par region (en moyenne)',fontweight='bold', fontsize=20)
plt.subplot(5,2,1) # maillage des subplot
plt.title('Accès internet : IT.NET.USER.P2',fontweight='bold',size=15)
df_stats_region.groupby('Region')['IT.NET.USER.P2'].mean().sort_values(ascending=True).plot(kind='barh',width=0.8)
plt.grid(False)
plt.subplot(5,2,2) # maillage des subplot
plt.title("Economie : NY.GNP.PCAP.PP.CD",fontweight='bold',size=15)
df_stats_region.groupby('Region')['NY.GNP.PCAP.PP.CD'].mean().sort_values(ascending=True).plot(kind='barh',width=0.8)
plt.grid(False)
plt.subplot(5,2,3) # maillage des subplot
plt.title("Population 15-20 ans : SP.POP.1524.TO.UN",fontweight='bold',size=15)
df_stats_region.groupby('Region')['SP.POP.1524.TO.UN'].mean().sort_values(ascending=True).plot(kind='barh',width=0.8)
plt.grid(False)
plt.subplot(5,2,4) # maillage des subplot
plt.title("Enseignement Secondaire : SE.SEC.ENRR",fontweight='bold',size=15)
df_stats_region.groupby('Region')['SE.SEC.ENRR'].mean().sort_values(ascending=True).plot(kind='barh',width=0.8)
plt.grid(False)
plt.subplot(5,2,5) # maillage des subplot
plt.title("Post Bac : SE.TER.ENRR",fontweight='bold',size=15)
df_stats_region.groupby('Region')['SE.TER.ENRR'].mean().sort_values(ascending=True).plot(kind='barh',width=0.8)
width=1
plt.grid(False)
plt.subplots_adjust(left=0.125, # gerer les espacements
bottom=0.1,
right=0.9,
top=0.93,
wspace=0.4,
hspace=0.4
)
plt.savefig("assets/graphiques/PREPROCESSING_4_Ordre de grandeurs.jpg", bbox_inches='tight')
plt.show()
df_score = df_PowerTransformer
df_score['Pays'] = df_indicateurs_retenus.index
On veut mettre en avant : les pays fortes connexion internet, forte population de 15-24 ans, suivant formation sup en priorité puis dans le secondaire et pouvant payer les services
| Indicator | code | Pondérations |
|---|---|---|
| IT | IT.NET.USER.P2 | 5 |
| POP 15-24 | SP.POP.1524.TO.UN | 4 |
| SUPERIEUR | SE.TER.ENRR | 3 |
| SECONDAIRE | SE.SEC.ENRR | 2 |
| ECONOMIE | NY.GNP.PCAP.PP.CD | 1 |
# On calcule et affecte un scoring
for i in range(len(df_score)):
df_score.loc[i,'Score'] = (round((df_score.loc[i,'IT.NET.USER.P2']*5)
+(df_score.loc[i,'SP.POP.1524.TO.UN']*4)
+ (df_score.loc[i,'SE.TER.ENRR']*3)
+ (df_score.loc[i,'SE.SEC.ENRR']*2)
+ (df_score.loc[i,'NY.GNP.PCAP.PP.CD']*1)/14,2)
)
# TOP 10
df_top_10 = df_score.sort_values(by='Score',ascending=False).head(10)
df_top_10
| IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN | Pays | Score | |
|---|---|---|---|---|---|---|---|
| 135 | 0.904686 | 1.462074 | 0.438005 | 1.345030 | 1.876738 | United States | 17.05 |
| 134 | 1.385123 | 1.126017 | 1.506951 | 0.690003 | 0.852785 | United Kingdom | 15.50 |
| 6 | 1.184173 | 1.222933 | 1.859802 | 1.433485 | 0.298286 | Australia | 15.22 |
| 69 | 1.322187 | 0.957368 | 0.483686 | 1.488486 | 0.749742 | Korea, Rep. | 15.11 |
| 66 | 1.359969 | 1.150131 | 0.583276 | 0.858930 | 1.112840 | Japan | 15.08 |
| 117 | 1.021567 | 0.963910 | 1.578976 | 1.421164 | 0.516456 | Spain | 14.66 |
| 12 | 1.197643 | 1.225723 | 2.938225 | 1.122505 | -0.179942 | Belgium | 14.60 |
| 95 | 1.377759 | 1.310942 | 1.783850 | 1.196069 | 0.088065 | Netherlands | 14.49 |
| 109 | 0.871983 | 0.586230 | 0.678854 | 1.235472 | 1.220982 | Russian Federation | 14.35 |
| 48 | 1.266653 | 1.296742 | 0.615350 | 0.972800 | 0.919258 | Germany | 14.25 |
df_top_10 = df_top_10.reset_index()
df_top_10 = df_top_10.drop(['index'],axis=1)
liste_pays_retenus = df_top_10['Pays'].tolist()
df_score_norm = df_PowerTransformer_norm
df_score_norm['Pays'] = df_norm.index
for i in range(len(df_score_norm)):
df_score_norm.loc[i,'Score'] = (round((df_score_norm.loc[i,'IT.NET.USER.P2']*5)
+(df_score_norm.loc[i,'SP.POP.1524.TO.UN']*4)
+ (df_score_norm.loc[i,'SE.TER.ENRR']*3)
+ (df_score_norm.loc[i,'SE.SEC.ENRR']*2)
+ (df_score_norm.loc[i,'NY.GNP.PCAP.PP.CD']*1)/14,2)
)
# TOP 10
df_top_10_norm = df_score_norm.sort_values(by='Score',ascending=False).head(10)
df_top_10_norm
| IT.NET.USER.P2 | NY.GNP.PCAP.PP.CD | SE.SEC.ENRR | SE.TER.ENRR | SP.POP.1524.TO.UN | Pays | Score | |
|---|---|---|---|---|---|---|---|
| 15 | 2.184787 | 1.702272 | 1.175855 | 1.950842 | 0.260830 | Korea, Rep. | 20.29 |
| 33 | 1.447463 | 2.239240 | 1.035853 | 1.762995 | 1.331004 | United States | 20.08 |
| 2 | 0.641004 | 0.538406 | 1.259486 | 0.681782 | 1.192399 | Brazil | 12.58 |
| 4 | 0.235485 | 0.433473 | 0.706352 | 0.402809 | 1.918316 | China | 11.50 |
| 5 | 0.519118 | 0.326610 | 1.091381 | 0.862176 | 0.472196 | Colombia | 9.28 |
| 32 | 0.163468 | -0.820426 | 1.214275 | 1.670500 | 0.066830 | Ukraine | 8.47 |
| 13 | -0.017903 | 0.775908 | 0.236952 | 1.378093 | 0.715522 | Iran, Islamic Rep. | 7.44 |
| 19 | 0.595935 | 0.728257 | 0.357749 | -0.210547 | 0.938671 | Mexico | 6.87 |
| 25 | 0.407826 | -0.538515 | 0.170730 | 0.072569 | 0.910227 | Philippines | 6.20 |
| 27 | 1.598854 | 1.507631 | 0.532078 | 0.765949 | -1.627177 | Slovak Republic | 4.96 |
# On ajoute la region a df_score
df_score_region_sc = pd.merge(df_score,df_stats_region,left_on="Pays", right_on='Country Name', how='inner')
df_score_region_sc.columns
Index(['IT.NET.USER.P2_x', 'NY.GNP.PCAP.PP.CD_x', 'SE.SEC.ENRR_x',
'SE.TER.ENRR_x', 'SP.POP.1524.TO.UN_x', 'Pays', 'Score', 'Country Name',
'Country Code', 'Region', 'Income Group', 'IT.NET.USER.P2_y',
'NY.GNP.PCAP.PP.CD_y', 'SE.SEC.ENRR_y', 'SE.TER.ENRR_y',
'SP.POP.1524.TO.UN_y'],
dtype='object')
# On supprime les colonne des indicateurs non normalisés
df_score_region_sc = df_score_region_sc.drop(columns=['IT.NET.USER.P2_y', 'Country Name','Country Code',
'NY.GNP.PCAP.PP.CD_y', 'SE.SEC.ENRR_y', 'SE.TER.ENRR_y',
'SP.POP.1524.TO.UN_y'],axis=1)
df_score_income_sc = df_score_region_sc.copy()
# On renomme les colonnes
df_score_region_sc = df_score_region_sc.rename(columns={'IT.NET.USER.P2_x':'NET.USER',
'NY.GNP.PCAP.PP.CD_x':'ECO',
'SE.SEC.ENRR_x':'SEC',
'SE.TER.ENRR_x':'TER',
'SP.POP.1524.TO.UN_x':'POP'})
# On regroupe par region et on fait la moyenne des indicateurs par region
df_score_region_sc = df_score_region_sc.groupby('Region').mean()
df_score_region_sc = df_score_region_sc.sort_values(by='Score',ascending=False)
df_score_region_sc
| NET.USER | ECO | SEC | TER | POP | Score | |
|---|---|---|---|---|---|---|
| Region | ||||||
| North America | 0.904686 | 1.462074 | 0.438005 | 1.345030 | 1.876738 | 17.050000 |
| Europe & Central Asia | 0.767370 | 0.666468 | 0.827983 | 0.789956 | -0.263289 | 6.856818 |
| East Asia & Pacific | 0.296663 | 0.522339 | 0.327918 | 0.394410 | 0.278486 | 4.473125 |
| Middle East & North Africa | 0.509407 | 0.541269 | 0.084012 | 0.154570 | -0.069502 | 2.940000 |
| Latin America & Caribbean | 0.134427 | 0.043293 | 0.213622 | 0.263573 | -0.273448 | 0.798571 |
| South Asia | -0.915696 | -0.695377 | -0.492470 | -0.805644 | 0.993137 | -4.057143 |
| Sub-Saharan Africa | -1.161091 | -1.155767 | -1.198132 | -1.201977 | 0.135559 | -11.347838 |
# Import Pi
from math import pi
# Set data
df_spider = df_score_region_sc[['NET.USER','ECO','SEC','TER','POP']]
df_spider
| NET.USER | ECO | SEC | TER | POP | |
|---|---|---|---|---|---|
| Region | |||||
| North America | 0.904686 | 1.462074 | 0.438005 | 1.345030 | 1.876738 |
| Europe & Central Asia | 0.767370 | 0.666468 | 0.827983 | 0.789956 | -0.263289 |
| East Asia & Pacific | 0.296663 | 0.522339 | 0.327918 | 0.394410 | 0.278486 |
| Middle East & North Africa | 0.509407 | 0.541269 | 0.084012 | 0.154570 | -0.069502 |
| Latin America & Caribbean | 0.134427 | 0.043293 | 0.213622 | 0.263573 | -0.273448 |
| South Asia | -0.915696 | -0.695377 | -0.492470 | -0.805644 | 0.993137 |
| Sub-Saharan Africa | -1.161091 | -1.155767 | -1.198132 | -1.201977 | 0.135559 |
plt.rc('axes', facecolor = 'Gainsboro')
# Nombre de variables
categories=list(df_spider.columns)
# Nombre de catégorie = nombre de colonne = 5
N = len(categories)
# Pour angle on divise le plot par le nombre de variable-> 1/4 cercle -> *2 -> demi cercle * pi cercle entier
angles = [n / N*2*pi for n in range(N)]
# On fixe la taille de la figure
fig = plt.figure(1, figsize=(20,18))
# On choisit les colonnes
# colors = ['cadetblue', 'lightseagreen', 'darkseagreen', 'indianred', 'steelblue', 'RoyalBlue','lightcoral']
colors = ['seagreen','mediumseagreen','darkseagreen', 'lightseagreen', 'steelblue', 'coral','lightcoral']
# Boucle pour chacune des régions contenues dans l'index
for i, region in enumerate(df_spider.index) :
# On crée un sous plot
ax = fig.add_subplot(3,3,i+1, polar=True)
# If you want the first axis to be on top:
ax.set_theta_offset(pi/2)
ax.set_theta_direction(-1)
# Draw one axe per variable + add labels labels yet
plt.xticks(angles, categories, color='black', size=8)
# On supprime la visibilité des graduations circulaires
plt.yticks(color="gray", size=0.1)
# On definit values = contenue
values=df_spider.iloc[i].values
# On dessine angle, values, motif, taille du trait, style du trait
ax.plot(angles, values, '.',color='black', linewidth=0, linestyle='solid')
# On definit la visualisation de l'aire dessiné
ax.fill(angles, values, colors[i], alpha=0.6)
# On aggrandit les labels
ax.set_xticklabels(categories, fontsize=12)
# Titer des graphiques et couleurs
ax.set_title(region, size=20, color=colors[i])
# grille
plt.grid(True,color="grey")
# On definit l'étendu de y entre -2 et 2 pour une bonne visualisation
plt.ylim(-2,2)
plt.savefig("assets/graphiques/PREPROCESSING_5_Ordre de grandeurs_Regions.jpg", bbox_inches='tight')
Les pays sélectionnées sont dans les régions qui ont les aires les plus larges ==> confirme les choix
fig = plt.figure(1, figsize=(10,6))
res = sns.heatmap(df_spider, square=True, linewidths=0.1, cmap='coolwarm')
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 14)
res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 12)
plt.title("Heatmap Indicateurs Region Score",fontsize=18,fontweight='bold')
# plt.savefig("assets/graphiques/PREPROCESSING_6_Heatmap_indicateurs_region.jpg")
plt.show()
df_score_income_sc.columns
Index(['IT.NET.USER.P2_x', 'NY.GNP.PCAP.PP.CD_x', 'SE.SEC.ENRR_x',
'SE.TER.ENRR_x', 'SP.POP.1524.TO.UN_x', 'Pays', 'Score', 'Region',
'Income Group'],
dtype='object')
df_score_income_sc=df_score_income_sc.groupby('Income Group').mean()
# On renomme les colonnes
df_score_income_sc = df_score_income_sc.rename(columns={'IT.NET.USER.P2_x':'NET.USER',
'NY.GNP.PCAP.PP.CD_x':'ECO',
'SE.SEC.ENRR_x':'SEC',
'SE.TER.ENRR_x':'TER',
'SP.POP.1524.TO.UN_x':'POP'})
df_score_income_sc.head()
| NET.USER | ECO | SEC | TER | POP | Score | |
|---|---|---|---|---|---|---|
| Income Group | ||||||
| High income: OECD | 1.129015 | 1.117205 | 1.082073 | 1.064266 | -0.095706 | 10.698667 |
| High income: nonOECD | 0.960337 | 1.146841 | 0.568577 | 0.610445 | -0.950219 | 4.050667 |
| Low income | -1.420378 | -1.423266 | -1.336054 | -1.302546 | 0.373721 | -12.287407 |
| Lower middle income | -0.498394 | -0.538922 | -0.387760 | -0.471286 | 0.124844 | -4.221429 |
| Upper middle income | 0.227833 | 0.199142 | 0.262248 | 0.320579 | 0.080742 | 2.962727 |
df_spider2 = df_score_income_sc[['NET.USER','ECO','SEC','TER','POP']]
plt.rc('axes', facecolor = 'Gainsboro')
# Nombre de variables
categories=list(df_spider2.columns)
# Nombre de catégorie = nombre de colonne = 5
N = len(categories)
# Pour angle on divise le plot par le nombre de variable-> 1/4 cercle -> *2 -> demi cercle * pi cercle entier
angles = [n / N*2*pi for n in range(N)]
# On fixe la taille de la figure
fig = plt.figure(1, figsize=(20,18))
# On choisit les colonnes
# colors = ['sienna', 'LightGreen', 'steelblue', 'RoyalBlue','Peru']
colors = ['seagreen','mediumseagreen','darkseagreen', 'lightseagreen', 'salmon']
# Boucle pour chacune des régions contenues dans l'index
for i, region in enumerate(df_spider2.index) :
# On crée un sous plot
ax = fig.add_subplot(3,3,i+1, polar=True)
# If you want the first axis to be on top:
ax.set_theta_offset(pi/2)
ax.set_theta_direction(-1)
# Draw one axe per variable + add labels labels yet
plt.xticks(angles, categories, color='black', size=8)
# On supprime la visibilité des graduations circulaires
plt.yticks(color="gray", size=0.1)
# On definit values = contenue
values=df_spider.iloc[i].values
# On dessine angle, values, motif, taille du trait, style du trait
ax.plot(angles, values, '.',color='black', linewidth=0, linestyle='solid')
# On definit la visualisation de l'aire dessiné
ax.fill(angles, values, colors[i], alpha=0.6)
# On aggrandit les labels
ax.set_xticklabels(categories, fontsize=12)
# Titer des graphiques et couleurs
ax.set_title(region, size=20, color=colors[i])
# grille
plt.grid(True,color="grey")
# On definit l'étendu de y entre -2 et 2 pour une bonne visualisation
plt.ylim(-2,2)
plt.savefig("assets/graphiques/PREPROCESSING_7_Ordre de grandeurs_Income.jpg", bbox_inches='tight')
Les pays sélectionnés font parti des region income qui ont les surfaces les plus étendues
df_spider2 = df_score_income_sc[['NET.USER','ECO','SEC','TER','POP']]
fig = plt.figure(1, figsize=(10,6))
res = sns.heatmap(df_spider2, square=True, linewidths=0.1, cmap='coolwarm')
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 14)
res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 12)
plt.title("heatmap Indicateurs Income Score",fontsize=18,fontweight='bold')
# plt.savefig("assets/graphiques/PREPROCESSING_8_Heatmap indicateur Income.jpg")
plt.show()
# PANDAS
#Import des pays du monde de geopandas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Préparation des données
# Map_1 : préparer les données côté geopandas
map_1 = world.sort_values(by=['iso_a3'])
map_1 = map_1.loc[:,['name','iso_a3']]
map_1 = map_1.rename(columns={'iso_a3':'iso3'})
map_1 = map_1.set_index('iso3')
# Map 2 : préparer les données côté EdStatsCountry
map_2=country.sort_values(by = ['Country Code'])
map_2=map_2.loc[:,['Country Code','Short Name']]
map_2=map_2.rename(columns={'Country Code':'iso3'})
map_2=map_2.set_index('iso3')
# Fusion des 2 dataframes
map_f = pd.merge(map_1,map_2, on='iso3', how='left')
map_f.reset_index(inplace=True)
# Recherche des noms de pays différents entre geopandas et notre jeu de données
rows = map_f[map_f['name'] != map_f['Short Name']]
rows.drop([0,7,8,39,50,56,144,163], inplace=True)
# Modifier World
df_comp=rows[rows['Short Name'].notnull()].sort_values(by='iso3')
noms_a_changer=df_comp['Short Name'].tolist()
codes_iso=df_comp['iso3'].tolist()
# Récupération des index geopandas des noms de pays à changer
index_geo=world[world['iso_a3'].isin(codes_iso)].index
# Récupération des codes ISO 3 de geopandas
codes_isoa3=world.loc[index_geo,'iso_a3'].tolist()
# Parcours les codes iso3 pour avoir une liste de noms de pays dans l'ordre de world
noms_a_changer=[]
[noms_a_changer.append(df_comp[df_comp['iso3'] == iso]['Short Name'].tolist()[0]) for iso in codes_isoa3]
# On remplace les noms de world par ceux de notre dataframe
world.loc[world.index[index_geo.tolist()],'name']=noms_a_changer
df_score.Pays[df_score.Pays=='Korea, Rep.']="Korea"
df_score.Pays[df_score.Pays=='Russian Federation'] = 'Russia'
df_score_geo = world.merge(df_score, left_on = 'name', right_on = 'Pays')[['geometry','Pays','Score']]
# import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
fig, ax = plt.subplots()
fig.set_size_inches(30,18)
# créer sous division des axes
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="2%", pad=0.1)
# graph
world.plot(legend = True,ax=ax, cax=cax, color='lightgrey',linestyle='-',edgecolor='black')
df_score_geo.plot(column=df_score_geo['Score'], legend = True, ax=ax, cax=cax, cmap='RdYlGn',linestyle="-",edgecolor='black')
ax.set_axis_off()
# plt.savefig("assets/graphiques/_MAP_1_Pays_indicateurs.jpg")
plt.show()
df_prediction = data.copy()
df_prediction = data[data['Country Name'].isin(liste_pays_retenus) & data['Indicator Code'].isin(indicateurs_retenus)]
df_prediction = df_prediction.melt(id_vars = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
value_vars = ['1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
'1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
'1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
'1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
'2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
'2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
'2095', '2100'], var_name = 'Year', value_name = 'Value')
df_prediction['Year'] = df_prediction['Year'].astype('int32')
df_prediction = df_prediction[df_prediction['Year'] > 2010]
# Representation graphique
fig = plt.figure(figsize=(20,20))
fig.suptitle("Evolution des indicateurs des pays sélectionner en fonction des années disponible dans dataEdStats",fontweight='bold', fontsize=20)
plt.subplot(5,2,1)
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'IT.NET.USER.P2'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'IT.NET.USER.P2'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False,
)
#plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("NUMERIQUE", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.subplot(5,2,2)
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'NY.GNP.PCAP.PP.CD'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'NY.GNP.PCAP.PP.CD'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
#plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("ECONOMIE", size=20,weight='bold')
plt.ylabel("",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.subplot(5,2,3)
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SP.POP.1524.TO.UN'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SP.POP.1524.TO.UN'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
#plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("POPULATION", size=20,weight='bold')
plt.ylabel("",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.subplot(5,2,4)
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SE.SEC.ENRR'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SE.SEC.ENRR'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
# plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("ETUDES SECONDAIRES", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.subplot(5,2,5)
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SE.TER.ENRR'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SE.TER.ENRR'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5
)
plt.legend(bbox_to_anchor=(1.5, 1), loc = 2)
plt.title("ETUDES SUPERIEURES", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.subplots_adjust(left=0.125,
bottom=0.1,
right=0.9,
top=0.9,
wspace=0.2,
hspace=0.5)
plt.savefig("assets/graphiques/PREDICTION_1_Synthese.jpg", bbox_inches='tight')
plt.show()
plt.figure(figsize=(12,8))
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'IT.NET.USER.P2'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'IT.NET.USER.P2'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False,
)
plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("IT.NET.USER.P2", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.savefig("assets/graphiques/PREDICTION_2_IT.NET.USER.P2.jpg", bbox_inches='tight')
plt.show()
plt.figure(figsize=(12,8))
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'NY.GNP.PCAP.PP.CD'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'NY.GNP.PCAP.PP.CD'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("NY.GNP.PCAP.PP.CD", size=20,weight='bold')
plt.ylabel("",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.savefig("assets/graphiques/PREDICTION_3_IT.NET.USER.P2.jpg", bbox_inches='tight')
plt.show()
plt.figure(figsize=(12,8))
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SP.POP.1524.TO.UN'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SP.POP.1524.TO.UN'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title('SP.POP.1524.TO.UN', size=20,weight='bold')
plt.ylabel("",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.savefig("assets/graphiques/PREDICTION_4_SP.POP.1524.TO.UN.jpg", bbox_inches='tight')
plt.show()
plt.figure(figsize=(12,8))
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SE.SEC.ENRR'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SE.SEC.ENRR'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5,
legend=False
)
plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("SE.SEC.ENRR", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.savefig("assets/graphiques/PREDICTION_5_SE.SEC.ENRR.jpg", bbox_inches='tight')
plt.show()
plt.figure(figsize=(12,8))
for i in liste_pays_retenus:
sns.lineplot(
x=df_prediction[df_prediction['Indicator Code'] == 'SE.TER.ENRR'][df_prediction['Country Name'] == i ]['Year'],
y=df_prediction[df_prediction['Indicator Code'] == 'SE.TER.ENRR'][df_prediction['Country Name'] == i ]['Value'],
label = i,
linewidth=2.5
)
plt.legend(bbox_to_anchor=(1, 1), loc = 2)
plt.title("SE.TER.ENRR", size=20,weight='bold')
plt.ylabel("%",rotation=0)
plt.xlabel("Années")
plt.grid(False)
plt.savefig("assets/graphiques/PREDICTION_6_SE.TER.ENRR.jpg", bbox_inches='tight')
plt.show()